[llvm] c823517 - [NFC][Codegen][X86] zero_extend_vector_inreg.ll: add SSE4.2 runline

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 29 14:45:35 PST 2022


Author: Roman Lebedev
Date: 2022-12-30T01:44:15+03:00
New Revision: c823517ef5f2ac0fafede18782d089a8aa91ed7b

URL: https://github.com/llvm/llvm-project/commit/c823517ef5f2ac0fafede18782d089a8aa91ed7b
DIFF: https://github.com/llvm/llvm-project/commit/c823517ef5f2ac0fafede18782d089a8aa91ed7b.diff

LOG: [NFC][Codegen][X86] zero_extend_vector_inreg.ll: add SSE4.2 runline

Added: 
    

Modified: 
    llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index e7ea3756b8f1..850eb01faf78 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -1,26 +1,36 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK1
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK2
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK3
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK4
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK5
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK6
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK7
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK8
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK8
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK9
 
 define void @vec16_v2i8_to_v1i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec16_v2i8_to_v1i16_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec16_v2i8_to_v1i16_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec16_v2i8_to_v1i16_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec16_v2i8_to_v1i16_factor2:
 ; AVX:       # %bb.0:
@@ -73,15 +83,24 @@ define void @vec16_v2i8_to_v1i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.
 }
 
 define void @vec32_v4i8_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec32_v4i8_to_v2i16_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec32_v4i8_to_v2i16_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec32_v4i8_to_v2i16_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec32_v4i8_to_v2i16_factor2:
 ; AVX:       # %bb.0:
@@ -134,17 +153,26 @@ define void @vec32_v4i8_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.
 }
 
 define void @vec32_v4i8_to_v1i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec32_v4i8_to_v1i32_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec32_v4i8_to_v1i32_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec32_v4i8_to_v1i32_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec32_v4i8_to_v1i32_factor4:
 ; AVX:       # %bb.0:
@@ -197,16 +225,25 @@ define void @vec32_v4i8_to_v1i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.
 }
 
 define void @vec32_v2i16_to_v1i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec32_v2i16_to_v1i32_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec32_v2i16_to_v1i32_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec32_v2i16_to_v1i32_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec32_v2i16_to_v1i32_factor2:
 ; AVX:       # %bb.0:
@@ -261,15 +298,24 @@ define void @vec32_v2i16_to_v1i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias
 }
 
 define void @vec64_v8i8_to_v4i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec64_v8i8_to_v4i16_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec64_v8i8_to_v4i16_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec64_v8i8_to_v4i16_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec64_v8i8_to_v4i16_factor2:
 ; AVX:       # %bb.0:
@@ -322,16 +368,25 @@ define void @vec64_v8i8_to_v4i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.
 }
 
 define void @vec64_v8i8_to_v2i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec64_v8i8_to_v2i32_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec64_v8i8_to_v2i32_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec64_v8i8_to_v2i32_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec64_v8i8_to_v2i32_factor4:
 ; AVX:       # %bb.0:
@@ -384,17 +439,26 @@ define void @vec64_v8i8_to_v2i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.
 }
 
 define void @vec64_v8i8_to_v1i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec64_v8i8_to_v1i64_factor8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec64_v8i8_to_v1i64_factor8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec64_v8i8_to_v1i64_factor8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec64_v8i8_to_v1i64_factor8:
 ; AVX:       # %bb.0:
@@ -447,15 +511,24 @@ define void @vec64_v8i8_to_v1i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.
 }
 
 define void @vec64_v4i16_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec64_v4i16_to_v2i32_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec64_v4i16_to_v2i32_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec64_v4i16_to_v2i32_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec64_v4i16_to_v2i32_factor2:
 ; AVX:       # %bb.0:
@@ -510,16 +583,25 @@ define void @vec64_v4i16_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias
 }
 
 define void @vec64_v4i16_to_v1i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec64_v4i16_to_v1i64_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec64_v4i16_to_v1i64_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec64_v4i16_to_v1i64_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec64_v4i16_to_v1i64_factor4:
 ; AVX:       # %bb.0:
@@ -574,15 +656,24 @@ define void @vec64_v4i16_to_v1i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias
 }
 
 define void @vec64_v2i32_to_v1i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec64_v2i32_to_v1i64_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec64_v2i32_to_v1i64_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec64_v2i32_to_v1i64_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec64_v2i32_to_v1i64_factor2:
 ; AVX:       # %bb.0:
@@ -637,15 +728,24 @@ define void @vec64_v2i32_to_v1i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias
 }
 
 define void @vec128_v16i8_to_v8i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec128_v16i8_to_v8i16_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec128_v16i8_to_v8i16_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec128_v16i8_to_v8i16_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec128_v16i8_to_v8i16_factor2:
 ; AVX:       # %bb.0:
@@ -698,16 +798,25 @@ define void @vec128_v16i8_to_v8i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bia
 }
 
 define void @vec128_v16i8_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec128_v16i8_to_v4i32_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec128_v16i8_to_v4i32_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec128_v16i8_to_v4i32_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec128_v16i8_to_v4i32_factor4:
 ; AVX:       # %bb.0:
@@ -760,17 +869,26 @@ define void @vec128_v16i8_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bia
 }
 
 define void @vec128_v16i8_to_v2i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec128_v16i8_to_v2i64_factor8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec128_v16i8_to_v2i64_factor8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec128_v16i8_to_v2i64_factor8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec128_v16i8_to_v2i64_factor8:
 ; AVX:       # %bb.0:
@@ -883,15 +1001,24 @@ define void @vec128_v16i8_to_v1i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec128_v8i16_to_v4i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec128_v8i16_to_v4i32_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec128_v8i16_to_v4i32_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec128_v8i16_to_v4i32_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec128_v8i16_to_v4i32_factor2:
 ; AVX:       # %bb.0:
@@ -946,16 +1073,25 @@ define void @vec128_v8i16_to_v4i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bia
 }
 
 define void @vec128_v8i16_to_v2i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec128_v8i16_to_v2i64_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec128_v8i16_to_v2i64_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec128_v8i16_to_v2i64_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec128_v8i16_to_v2i64_factor4:
 ; AVX:       # %bb.0:
@@ -1010,14 +1146,24 @@ define void @vec128_v8i16_to_v2i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bia
 }
 
 define void @vec128_v8i16_to_v1i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec128_v8i16_to_v1i128_factor8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec128_v8i16_to_v1i128_factor8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec128_v8i16_to_v1i128_factor8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec128_v8i16_to_v1i128_factor8:
 ; AVX:       # %bb.0:
@@ -1076,15 +1222,24 @@ define void @vec128_v8i16_to_v1i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec128_v4i32_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec128_v4i32_to_v2i64_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec128_v4i32_to_v2i64_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec128_v4i32_to_v2i64_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec128_v4i32_to_v2i64_factor2:
 ; AVX:       # %bb.0:
@@ -1139,15 +1294,25 @@ define void @vec128_v4i32_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bia
 }
 
 define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec128_v4i32_to_v1i128_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT:    paddb (%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec128_v4i32_to_v1i128_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    paddb (%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec128_v4i32_to_v1i128_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec128_v4i32_to_v1i128_factor4:
 ; AVX:       # %bb.0:
@@ -1268,19 +1433,32 @@ define void @vec128_v2i64_to_v1i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec256_v32i8_to_v16i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec256_v32i8_to_v16i16_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec256_v32i8_to_v16i16_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec256_v32i8_to_v16i16_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec256_v32i8_to_v16i16_factor2:
 ; AVX:       # %bb.0:
@@ -1337,20 +1515,33 @@ define void @vec256_v32i8_to_v16i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec256_v32i8_to_v8i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec256_v32i8_to_v8i32_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec256_v32i8_to_v8i32_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec256_v32i8_to_v8i32_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec256_v32i8_to_v8i32_factor4:
 ; AVX:       # %bb.0:
@@ -1407,21 +1598,34 @@ define void @vec256_v32i8_to_v8i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bia
 }
 
 define void @vec256_v32i8_to_v4i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec256_v32i8_to_v4i64_factor8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec256_v32i8_to_v4i64_factor8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec256_v32i8_to_v4i64_factor8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    psrld $16, %xmm0
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec256_v32i8_to_v4i64_factor8:
 ; AVX:       # %bb.0:
@@ -1620,19 +1824,32 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec256_v16i16_to_v8i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec256_v16i16_to_v8i32_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec256_v16i16_to_v8i32_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec256_v16i16_to_v8i32_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec256_v16i16_to_v8i32_factor2:
 ; AVX:       # %bb.0:
@@ -1691,20 +1908,33 @@ define void @vec256_v16i16_to_v8i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec256_v16i16_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec256_v16i16_to_v4i64_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec256_v16i16_to_v4i64_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec256_v16i16_to_v4i64_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec256_v16i16_to_v4i64_factor4:
 ; AVX:       # %bb.0:
@@ -1763,19 +1993,33 @@ define void @vec256_v16i16_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec256_v16i16_to_v2i128_factor8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec256_v16i16_to_v2i128_factor8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec256_v16i16_to_v2i128_factor8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec256_v16i16_to_v2i128_factor8:
 ; AVX:       # %bb.0:
@@ -1843,16 +2087,28 @@ define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec256_v16i16_to_v1i256_factor16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    movaps 16(%rdx), %xmm1
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movaps %xmm1, 16(%rcx)
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec256_v16i16_to_v1i256_factor16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movaps 16(%rdx), %xmm1
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movaps %xmm1, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec256_v16i16_to_v1i256_factor16:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT:    movaps 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec256_v16i16_to_v1i256_factor16:
 ; AVX:       # %bb.0:
@@ -1913,19 +2169,32 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
 }
 
 define void @vec256_v8i32_to_v4i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec256_v8i32_to_v4i64_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec256_v8i32_to_v4i64_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec256_v8i32_to_v4i64_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec256_v8i32_to_v4i64_factor2:
 ; AVX:       # %bb.0:
@@ -1984,20 +2253,35 @@ define void @vec256_v8i32_to_v4i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bia
 }
 
 define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec256_v8i32_to_v2i128_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec256_v8i32_to_v2i128_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec256_v8i32_to_v2i128_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec256_v8i32_to_v2i128_factor4:
 ; AVX:       # %bb.0:
@@ -2092,17 +2376,29 @@ define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec256_v8i32_to_v1i256_factor8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT:    movaps 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm1
-; SSE-NEXT:    movaps %xmm0, 16(%rcx)
-; SSE-NEXT:    movdqa %xmm1, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec256_v8i32_to_v1i256_factor8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    movaps 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm1
+; SSE2-NEXT:    movaps %xmm0, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm1, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec256_v8i32_to_v1i256_factor8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT:    movaps 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec256_v8i32_to_v1i256_factor8:
 ; AVX:       # %bb.0:
@@ -2367,24 +2663,42 @@ define void @vec256_v2i128_to_v1i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec384_v48i8_to_v24i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v48i8_to_v24i16_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    paddb 16(%rsi), %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm3
-; SSE-NEXT:    paddb 32(%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm3, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v48i8_to_v24i16_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    paddb 16(%rsi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm3
+; SSE2-NEXT:    paddb 32(%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm3, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v48i8_to_v24i16_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    paddb 16(%rsi), %xmm1
+; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE42-NEXT:    pxor %xmm3, %xmm3
+; SSE42-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    paddb 32(%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v48i8_to_v24i16_factor2:
 ; AVX:       # %bb.0:
@@ -2454,28 +2768,45 @@ define void @vec384_v48i8_to_v24i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v48i8_to_v16i24_factor3:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
-; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7]
-; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,6]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    paddb 32(%rdx), %xmm2
-; SSE-NEXT:    paddb 16(%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, 16(%rcx)
-; SSE-NEXT:    movdqa %xmm2, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v48i8_to_v16i24_factor3:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5]
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,6]
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    paddb 32(%rdx), %xmm2
+; SSE2-NEXT:    paddb 16(%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v48i8_to_v16i24_factor3:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, %xmm1
+; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,xmm1[11],zero,zero,xmm1[12],zero,zero,xmm1[13],zero,zero,xmm1[14],zero,zero,xmm1[15],zero,zero
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero,zero,xmm2[5]
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    paddb 32(%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v48i8_to_v16i24_factor3:
 ; AVX:       # %bb.0:
@@ -2546,25 +2877,42 @@ define void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec384_v48i8_to_v12i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v48i8_to_v12i32_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm3
-; SSE-NEXT:    paddb 32(%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm3, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v48i8_to_v12i32_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm3
+; SSE2-NEXT:    paddb 32(%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm3, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v48i8_to_v12i32_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v48i8_to_v12i32_factor4:
 ; AVX:       # %bb.0:
@@ -2658,24 +3006,41 @@ define void @vec384_v48i8_to_v12i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v48i8_to_v8i48_factor6:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    paddb 32(%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v48i8_to_v8i48_factor6:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    paddb 32(%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v48i8_to_v8i48_factor6:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, %xmm1
+; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7],zero,zero,zero,zero,zero
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    paddb 32(%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v48i8_to_v8i48_factor6:
 ; AVX:       # %bb.0:
@@ -2749,26 +3114,43 @@ define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bia
 }
 
 define void @vec384_v48i8_to_v6i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v48i8_to_v6i64_factor8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm3
-; SSE-NEXT:    paddb 32(%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm3, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v48i8_to_v6i64_factor8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm3
+; SSE2-NEXT:    paddb 32(%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm3, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v48i8_to_v6i64_factor8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    psrld $16, %xmm0
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v48i8_to_v6i64_factor8:
 ; AVX:       # %bb.0:
@@ -2862,28 +3244,45 @@ define void @vec384_v48i8_to_v6i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bia
 }
 
 define void @vec384_v48i8_to_v4i96_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v48i8_to_v4i96_factor12:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    paddb 16(%rdx), %xmm2
-; SSE-NEXT:    paddb 32(%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm2, 16(%rcx)
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v48i8_to_v4i96_factor12:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    paddb 16(%rdx), %xmm2
+; SSE2-NEXT:    paddb 32(%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v48i8_to_v4i96_factor12:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, %xmm1
+; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    paddb 32(%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v48i8_to_v4i96_factor12:
 ; AVX:       # %bb.0:
@@ -3069,32 +3468,47 @@ define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
-  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-  %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 1, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 2, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
-  %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
-  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
-  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
-  ret void
-}
-
-define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v48i8_to_v2i192_factor24:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    movaps 32(%rdx), %xmm2
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm1
-; SSE-NEXT:    movaps %xmm2, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm1, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+  %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 1, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 2, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
+  %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
+  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
+  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
+  ret void
+}
+
+define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
+; SSE2-LABEL: vec384_v48i8_to_v2i192_factor24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT:    movaps 32(%rdx), %xmm2
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm1
+; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm1, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v48i8_to_v2i192_factor24:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, %xmm1
+; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE42-NEXT:    movaps 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    paddb 16(%rdx), %xmm1
+; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v48i8_to_v2i192_factor24:
 ; AVX:       # %bb.0:
@@ -3239,24 +3653,42 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec384_v24i16_to_v12i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v24i16_to_v12i32_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    paddb 16(%rsi), %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm3
-; SSE-NEXT:    paddb 32(%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm3, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v24i16_to_v12i32_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    paddb 16(%rsi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm3
+; SSE2-NEXT:    paddb 32(%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm3, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v24i16_to_v12i32_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    paddb 16(%rsi), %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm3
+; SSE42-NEXT:    paddb 32(%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v24i16_to_v12i32_factor2:
 ; AVX:       # %bb.0:
@@ -3331,23 +3763,42 @@ define void @vec384_v24i16_to_v12i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v24i16_to_v8i48_factor3:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    paddb 32(%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v24i16_to_v8i48_factor3:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    paddb 32(%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v24i16_to_v8i48_factor3:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm3
+; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v24i16_to_v8i48_factor3:
 ; AVX:       # %bb.0:
@@ -3491,25 +3942,42 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec384_v24i16_to_v6i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v24i16_to_v6i64_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm3
-; SSE-NEXT:    paddb 32(%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm3, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v24i16_to_v6i64_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm3
+; SSE2-NEXT:    paddb 32(%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm3, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v24i16_to_v6i64_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v24i16_to_v6i64_factor4:
 ; AVX:       # %bb.0:
@@ -3607,26 +4075,45 @@ define void @vec384_v24i16_to_v6i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v24i16_to_v4i96_factor6:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    paddb 32(%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v24i16_to_v4i96_factor6:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    paddb 32(%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v24i16_to_v4i96_factor6:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,3],zero,zero
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT:    psrld $16, %xmm0
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6,7]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm3
+; SSE42-NEXT:    paddb 32(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v24i16_to_v4i96_factor6:
 ; AVX:       # %bb.0:
@@ -3769,23 +4256,41 @@ define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v24i16_to_v3i128_factor8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    paddb 16(%rdx), %xmm2
-; SSE-NEXT:    paddb 32(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm2, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v24i16_to_v3i128_factor8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    paddb 16(%rdx), %xmm2
+; SSE2-NEXT:    paddb 32(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v24i16_to_v3i128_factor8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; SSE42-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm2
+; SSE42-NEXT:    paddb 32(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v24i16_to_v3i128_factor8:
 ; AVX:       # %bb.0:
@@ -3932,22 +4437,39 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v24i16_to_v2i192_factor12:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    movaps 32(%rdx), %xmm2
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm1
-; SSE-NEXT:    movaps %xmm2, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm1, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v24i16_to_v2i192_factor12:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT:    movaps 32(%rdx), %xmm2
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm1
+; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm1, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v24i16_to_v2i192_factor12:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE42-NEXT:    movaps 32(%rdx), %xmm1
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    movaps %xmm1, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v24i16_to_v2i192_factor12:
 ; AVX:       # %bb.0:
@@ -4020,18 +4542,32 @@ define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
 }
 
 define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v24i16_to_v1i384_factor24:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    movaps 16(%rdx), %xmm1
-; SSE-NEXT:    movaps 32(%rdx), %xmm2
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movaps %xmm1, 16(%rcx)
-; SSE-NEXT:    movaps %xmm2, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v24i16_to_v1i384_factor24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movaps 16(%rdx), %xmm1
+; SSE2-NEXT:    movaps 32(%rdx), %xmm2
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movaps %xmm1, 16(%rcx)
+; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v24i16_to_v1i384_factor24:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT:    movaps 16(%rdx), %xmm0
+; SSE42-NEXT:    movaps 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
+; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v24i16_to_v1i384_factor24:
 ; AVX:       # %bb.0:
@@ -4098,24 +4634,42 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.
 }
 
 define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v12i32_to_v6i64_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    paddb 16(%rsi), %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm3
-; SSE-NEXT:    paddb 32(%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm3, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v12i32_to_v6i64_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    paddb 16(%rsi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm3
+; SSE2-NEXT:    paddb 32(%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm3, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v12i32_to_v6i64_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    paddb 16(%rsi), %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm3
+; SSE42-NEXT:    paddb 32(%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v12i32_to_v6i64_factor2:
 ; AVX:       # %bb.0:
@@ -4193,25 +4747,44 @@ define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v12i32_to_v4i96_factor3:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,0,4294967295,0]
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    paddb 32(%rdx), %xmm3
-; SSE-NEXT:    paddb 16(%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, 16(%rcx)
-; SSE-NEXT:    movdqa %xmm3, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v12i32_to_v4i96_factor3:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,0,4294967295,0]
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    paddb 32(%rdx), %xmm3
+; SSE2-NEXT:    paddb 16(%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v12i32_to_v4i96_factor3:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    paddb 32(%rdx), %xmm3
+; SSE42-NEXT:    paddb 16(%rdx), %xmm2
+; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v12i32_to_v4i96_factor3:
 ; AVX:       # %bb.0:
@@ -4340,25 +4913,44 @@ define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v12i32_to_v3i128_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb 32(%rdx), %xmm3
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm3, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v12i32_to_v3i128_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb 32(%rdx), %xmm3
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v12i32_to_v3i128_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb 32(%rdx), %xmm3
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v12i32_to_v3i128_factor4:
 ; AVX:       # %bb.0:
@@ -4490,22 +5082,39 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v12i32_to_v2i192_factor6:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT:    movaps 32(%rdx), %xmm0
-; SSE-NEXT:    paddb 16(%rdx), %xmm1
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    movaps %xmm0, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm1, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v12i32_to_v2i192_factor6:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; SSE2-NEXT:    movaps 32(%rdx), %xmm0
+; SSE2-NEXT:    paddb 16(%rdx), %xmm1
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movaps %xmm0, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v12i32_to_v2i192_factor6:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; SSE42-NEXT:    movaps 32(%rdx), %xmm1
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    movaps %xmm1, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v12i32_to_v2i192_factor6:
 ; AVX:       # %bb.0:
@@ -4611,19 +5220,33 @@ define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v12i32_to_v1i384_factor12:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT:    movaps 16(%rdx), %xmm0
-; SSE-NEXT:    movaps 32(%rdx), %xmm2
-; SSE-NEXT:    paddb (%rdx), %xmm1
-; SSE-NEXT:    movaps %xmm0, 16(%rcx)
-; SSE-NEXT:    movaps %xmm2, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm1, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v12i32_to_v1i384_factor12:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    movaps 16(%rdx), %xmm0
+; SSE2-NEXT:    movaps 32(%rdx), %xmm2
+; SSE2-NEXT:    paddb (%rdx), %xmm1
+; SSE2-NEXT:    movaps %xmm0, 16(%rcx)
+; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm1, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v12i32_to_v1i384_factor12:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT:    movaps 16(%rdx), %xmm0
+; SSE42-NEXT:    movaps 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
+; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v12i32_to_v1i384_factor12:
 ; AVX:       # %bb.0:
@@ -4788,20 +5411,35 @@ define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec384_v6i64_to_v2i192_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec384_v6i64_to_v2i192_factor3:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE-NEXT:    movaps 32(%rdx), %xmm2
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    paddb 16(%rdx), %xmm1
-; SSE-NEXT:    movaps %xmm2, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm1, 16(%rcx)
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec384_v6i64_to_v2i192_factor3:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE2-NEXT:    movaps 32(%rdx), %xmm2
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    paddb 16(%rdx), %xmm1
+; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec384_v6i64_to_v2i192_factor3:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE42-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE42-NEXT:    movaps 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb (%rdx), %xmm0
+; SSE42-NEXT:    paddb 16(%rdx), %xmm1
+; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec384_v6i64_to_v2i192_factor3:
 ; AVX:       # %bb.0:
@@ -5018,28 +5656,49 @@ define void @vec384_v3i128_to_v1i384_factor3(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec512_v64i8_to_v32i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v64i8_to_v32i16_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    paddb 16(%rsi), %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE-NEXT:    movdqa %xmm0, %xmm4
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm4
-; SSE-NEXT:    paddb 48(%rdx), %xmm1
-; SSE-NEXT:    paddb 32(%rdx), %xmm3
-; SSE-NEXT:    movdqa %xmm3, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm1, 48(%rcx)
-; SSE-NEXT:    movdqa %xmm4, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v64i8_to_v32i16_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    paddb 16(%rsi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm4
+; SSE2-NEXT:    paddb 48(%rdx), %xmm1
+; SSE2-NEXT:    paddb 32(%rdx), %xmm3
+; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm1, 48(%rcx)
+; SSE2-NEXT:    movdqa %xmm4, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v64i8_to_v32i16_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    paddb 16(%rsi), %xmm1
+; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE42-NEXT:    pxor %xmm3, %xmm3
+; SSE42-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE42-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm4
+; SSE42-NEXT:    paddb 48(%rdx), %xmm1
+; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, 48(%rcx)
+; SSE42-NEXT:    movdqa %xmm4, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v64i8_to_v32i16_factor2:
 ; AVX:       # %bb.0:
@@ -5110,29 +5769,50 @@ define void @vec512_v64i8_to_v32i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec512_v64i8_to_v16i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v64i8_to_v16i32_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    movdqa %xmm0, %xmm4
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm4
-; SSE-NEXT:    paddb 48(%rdx), %xmm2
-; SSE-NEXT:    paddb 32(%rdx), %xmm3
-; SSE-NEXT:    movdqa %xmm3, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm2, 48(%rcx)
-; SSE-NEXT:    movdqa %xmm4, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v64i8_to_v16i32_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm4
+; SSE2-NEXT:    paddb 48(%rdx), %xmm2
+; SSE2-NEXT:    paddb 32(%rdx), %xmm3
+; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
+; SSE2-NEXT:    movdqa %xmm4, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v64i8_to_v16i32_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb 48(%rdx), %xmm3
+; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v64i8_to_v16i32_factor4:
 ; AVX:       # %bb.0:
@@ -5202,30 +5882,52 @@ define void @vec512_v64i8_to_v16i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec512_v64i8_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v64i8_to_v8i64_factor8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    movdqa %xmm0, %xmm4
-; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm4
-; SSE-NEXT:    paddb 48(%rdx), %xmm2
-; SSE-NEXT:    paddb 32(%rdx), %xmm3
-; SSE-NEXT:    movdqa %xmm3, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm2, 48(%rcx)
-; SSE-NEXT:    movdqa %xmm4, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v64i8_to_v8i64_factor8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm4
+; SSE2-NEXT:    paddb 48(%rdx), %xmm2
+; SSE2-NEXT:    paddb 32(%rdx), %xmm3
+; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
+; SSE2-NEXT:    movdqa %xmm4, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v64i8_to_v8i64_factor8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    movdqa %xmm0, %xmm3
+; SSE42-NEXT:    psrlq $48, %xmm3
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    psrld $16, %xmm0
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb 48(%rdx), %xmm3
+; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v64i8_to_v8i64_factor8:
 ; AVX:       # %bb.0:
@@ -5641,28 +6343,49 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec512_v32i16_to_v16i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v32i16_to_v16i32_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    paddb 16(%rsi), %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT:    movdqa %xmm0, %xmm4
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm4
-; SSE-NEXT:    paddb 48(%rdx), %xmm1
-; SSE-NEXT:    paddb 32(%rdx), %xmm3
-; SSE-NEXT:    movdqa %xmm3, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm1, 48(%rcx)
-; SSE-NEXT:    movdqa %xmm4, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v32i16_to_v16i32_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    paddb 16(%rsi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm4
+; SSE2-NEXT:    paddb 48(%rdx), %xmm1
+; SSE2-NEXT:    paddb 32(%rdx), %xmm3
+; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm1, 48(%rcx)
+; SSE2-NEXT:    movdqa %xmm4, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v32i16_to_v16i32_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    paddb 16(%rsi), %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE42-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm4
+; SSE42-NEXT:    paddb 48(%rdx), %xmm1
+; SSE42-NEXT:    paddb 32(%rdx), %xmm3
+; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, 48(%rcx)
+; SSE42-NEXT:    movdqa %xmm4, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v32i16_to_v16i32_factor2:
 ; AVX:       # %bb.0:
@@ -5734,29 +6457,50 @@ define void @vec512_v32i16_to_v16i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec512_v32i16_to_v8i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v32i16_to_v8i64_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    movdqa %xmm0, %xmm4
-; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm4
-; SSE-NEXT:    paddb 48(%rdx), %xmm2
-; SSE-NEXT:    paddb 32(%rdx), %xmm3
-; SSE-NEXT:    movdqa %xmm3, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm2, 48(%rcx)
-; SSE-NEXT:    movdqa %xmm4, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v32i16_to_v8i64_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm4
+; SSE2-NEXT:    paddb 48(%rdx), %xmm2
+; SSE2-NEXT:    paddb 32(%rdx), %xmm3
+; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
+; SSE2-NEXT:    movdqa %xmm4, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v32i16_to_v8i64_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb 48(%rdx), %xmm3
+; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v32i16_to_v8i64_factor4:
 ; AVX:       # %bb.0:
@@ -5828,27 +6572,49 @@ define void @vec512_v32i16_to_v8i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v32i16_to_v4i128_factor8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    paddb 16(%rdx), %xmm3
-; SSE-NEXT:    paddb 48(%rdx), %xmm2
-; SSE-NEXT:    paddb 32(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm2, 48(%rcx)
-; SSE-NEXT:    movdqa %xmm3, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v32i16_to_v4i128_factor8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    paddb 16(%rdx), %xmm3
+; SSE2-NEXT:    paddb 48(%rdx), %xmm2
+; SSE2-NEXT:    paddb 32(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
+; SSE2-NEXT:    movdqa %xmm3, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v32i16_to_v4i128_factor8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE42-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm3
+; SSE42-NEXT:    paddb 48(%rdx), %xmm2
+; SSE42-NEXT:    paddb 32(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm2, 48(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v32i16_to_v4i128_factor8:
 ; AVX:       # %bb.0:
@@ -5988,23 +6754,41 @@ define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v32i16_to_v2i256_factor16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    movaps 16(%rdx), %xmm2
-; SSE-NEXT:    movaps 48(%rdx), %xmm3
-; SSE-NEXT:    paddb 32(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm1
-; SSE-NEXT:    movaps %xmm3, 48(%rcx)
-; SSE-NEXT:    movaps %xmm2, 16(%rcx)
-; SSE-NEXT:    movdqa %xmm1, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v32i16_to_v2i256_factor16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    movaps 16(%rdx), %xmm2
+; SSE2-NEXT:    movaps 48(%rdx), %xmm3
+; SSE2-NEXT:    paddb 32(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm1
+; SSE2-NEXT:    movaps %xmm3, 48(%rcx)
+; SSE2-NEXT:    movaps %xmm2, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm1, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v32i16_to_v2i256_factor16:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    movaps 16(%rdx), %xmm2
+; SSE42-NEXT:    movaps 48(%rdx), %xmm3
+; SSE42-NEXT:    paddb 32(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movaps %xmm3, 48(%rcx)
+; SSE42-NEXT:    movaps %xmm2, 16(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v32i16_to_v2i256_factor16:
 ; AVX:       # %bb.0:
@@ -6077,20 +6861,36 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
 }
 
 define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v32i16_to_v1i512_factor32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    movaps 16(%rdx), %xmm1
-; SSE-NEXT:    movaps 32(%rdx), %xmm2
-; SSE-NEXT:    movaps 48(%rdx), %xmm3
-; SSE-NEXT:    paddb (%rdx), %xmm0
-; SSE-NEXT:    movaps %xmm2, 32(%rcx)
-; SSE-NEXT:    movaps %xmm3, 48(%rcx)
-; SSE-NEXT:    movaps %xmm1, 16(%rcx)
-; SSE-NEXT:    movdqa %xmm0, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v32i16_to_v1i512_factor32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movaps 16(%rdx), %xmm1
+; SSE2-NEXT:    movaps 32(%rdx), %xmm2
+; SSE2-NEXT:    movaps 48(%rdx), %xmm3
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE2-NEXT:    movaps %xmm3, 48(%rcx)
+; SSE2-NEXT:    movaps %xmm1, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v32i16_to_v1i512_factor32:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE42-NEXT:    movaps 16(%rdx), %xmm0
+; SSE42-NEXT:    movaps 32(%rdx), %xmm2
+; SSE42-NEXT:    movaps 48(%rdx), %xmm3
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE42-NEXT:    movaps %xmm3, 48(%rcx)
+; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v32i16_to_v1i512_factor32:
 ; AVX:       # %bb.0:
@@ -6155,28 +6955,49 @@ define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.
 }
 
 define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v16i32_to_v8i64_factor2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    paddb 16(%rsi), %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE-NEXT:    movdqa %xmm0, %xmm4
-; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm4
-; SSE-NEXT:    paddb 48(%rdx), %xmm1
-; SSE-NEXT:    paddb 32(%rdx), %xmm3
-; SSE-NEXT:    movdqa %xmm3, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm1, 48(%rcx)
-; SSE-NEXT:    movdqa %xmm4, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v16i32_to_v8i64_factor2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    paddb 16(%rsi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm4
+; SSE2-NEXT:    paddb 48(%rdx), %xmm1
+; SSE2-NEXT:    paddb 32(%rdx), %xmm3
+; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm1, 48(%rcx)
+; SSE2-NEXT:    movdqa %xmm4, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v16i32_to_v8i64_factor2:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    paddb 16(%rsi), %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm4
+; SSE42-NEXT:    paddb 48(%rdx), %xmm1
+; SSE42-NEXT:    paddb 32(%rdx), %xmm3
+; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, 48(%rcx)
+; SSE42-NEXT:    movdqa %xmm4, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v16i32_to_v8i64_factor2:
 ; AVX:       # %bb.0:
@@ -6255,29 +7076,51 @@ define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 }
 
 define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v16i32_to_v4i128_factor4:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    xorps %xmm3, %xmm3
-; SSE-NEXT:    movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
-; SSE-NEXT:    movdqa %xmm0, %xmm4
-; SSE-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb 32(%rdx), %xmm4
-; SSE-NEXT:    paddb (%rdx), %xmm3
-; SSE-NEXT:    paddb 48(%rdx), %xmm2
-; SSE-NEXT:    movdqa %xmm2, 48(%rcx)
-; SSE-NEXT:    movdqa %xmm3, (%rcx)
-; SSE-NEXT:    movdqa %xmm4, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v16i32_to_v4i128_factor4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    xorps %xmm3, %xmm3
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb 32(%rdx), %xmm4
+; SSE2-NEXT:    paddb (%rdx), %xmm3
+; SSE2-NEXT:    paddb 48(%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
+; SSE2-NEXT:    movdqa %xmm3, (%rcx)
+; SSE2-NEXT:    movdqa %xmm4, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v16i32_to_v4i128_factor4:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    pxor %xmm4, %xmm4
+; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
+; SSE42-NEXT:    paddb 16(%rdx), %xmm3
+; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    paddb 48(%rdx), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, 48(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v16i32_to_v4i128_factor4:
 ; AVX:       # %bb.0:
@@ -6401,24 +7244,43 @@ define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v16i32_to_v2i256_factor8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE-NEXT:    movaps 16(%rdx), %xmm1
-; SSE-NEXT:    movaps 48(%rdx), %xmm3
-; SSE-NEXT:    paddb 32(%rdx), %xmm0
-; SSE-NEXT:    paddb (%rdx), %xmm2
-; SSE-NEXT:    movaps %xmm3, 48(%rcx)
-; SSE-NEXT:    movaps %xmm1, 16(%rcx)
-; SSE-NEXT:    movdqa %xmm2, (%rcx)
-; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v16i32_to_v2i256_factor8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE2-NEXT:    movaps 16(%rdx), %xmm1
+; SSE2-NEXT:    movaps 48(%rdx), %xmm3
+; SSE2-NEXT:    paddb 32(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movaps %xmm3, 48(%rcx)
+; SSE2-NEXT:    movaps %xmm1, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v16i32_to_v2i256_factor8:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT:    movaps 16(%rdx), %xmm1
+; SSE42-NEXT:    movaps 48(%rdx), %xmm3
+; SSE42-NEXT:    paddb 32(%rdx), %xmm0
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    movaps %xmm3, 48(%rcx)
+; SSE42-NEXT:    movaps %xmm1, 16(%rcx)
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v16i32_to_v2i256_factor8:
 ; AVX:       # %bb.0:
@@ -6535,21 +7397,37 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 }
 
 define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v16i32_to_v1i512_factor16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT:    movaps 16(%rdx), %xmm0
-; SSE-NEXT:    movaps 32(%rdx), %xmm2
-; SSE-NEXT:    movaps 48(%rdx), %xmm3
-; SSE-NEXT:    paddb (%rdx), %xmm1
-; SSE-NEXT:    movaps %xmm2, 32(%rcx)
-; SSE-NEXT:    movaps %xmm3, 48(%rcx)
-; SSE-NEXT:    movaps %xmm0, 16(%rcx)
-; SSE-NEXT:    movdqa %xmm1, (%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v16i32_to_v1i512_factor16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    movaps 16(%rdx), %xmm0
+; SSE2-NEXT:    movaps 32(%rdx), %xmm2
+; SSE2-NEXT:    movaps 48(%rdx), %xmm3
+; SSE2-NEXT:    paddb (%rdx), %xmm1
+; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE2-NEXT:    movaps %xmm3, 48(%rcx)
+; SSE2-NEXT:    movaps %xmm0, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm1, (%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v16i32_to_v1i512_factor16:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE42-NEXT:    movaps 16(%rdx), %xmm0
+; SSE42-NEXT:    movaps 32(%rdx), %xmm2
+; SSE42-NEXT:    movaps 48(%rdx), %xmm3
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
+; SSE42-NEXT:    movaps %xmm3, 48(%rcx)
+; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v16i32_to_v1i512_factor16:
 ; AVX:       # %bb.0:
@@ -7130,3 +8008,4 @@ define void @vec512_v2i256_to_v1i512_factor2(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; FALLBACK6: {{.*}}
 ; FALLBACK7: {{.*}}
 ; FALLBACK8: {{.*}}
+; FALLBACK9: {{.*}}


        


More information about the llvm-commits mailing list