[llvm] [X86] Cleanup check prefixes for any/zero_extend_vector_inreg_of_broadcast_from_memory.ll tests (PR #172043)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 12 08:31:28 PST 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/172043
Remove all unused check prefixes. In addition, update_llc_test_checks.py now warns about prefix clashes, so we no longer need the fallback prefixes.
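For reference, a typical way to regenerate the assertions after editing the RUN lines (a sketch, assuming a built llc is on PATH or supplied via --llc-binary):

  python3 llvm/utils/update_llc_test_checks.py \
      llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

The script diagnoses RUN lines whose output would clash under a shared check prefix, which is what makes the old FALLBACK0-FALLBACK13 prefixes redundant.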
From 0b8167b29c0cb19edd8f4bf62075cf2cf956ac74 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 12 Dec 2025 16:30:39 +0000
Subject: [PATCH] [X86] Cleanup check prefixes for
any/zero_extend_vector_inreg_of_broadcast_from_memory.ll tests
Remove all unused check prefixes. In addition, update_llc_test_checks.py now warns about prefix clashes, so we no longer need the fallback prefixes.
---
...d_vector_inreg_of_broadcast_from_memory.ll | 1557 ++++++++---------
...d_vector_inreg_of_broadcast_from_memory.ll | 1534 ++++++++--------
2 files changed, 1528 insertions(+), 1563 deletions(-)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 89b5c33e3f27b..49eb82e8434cf 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1,18 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-SLOW,FALLBACK8
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-FAST,FALLBACK9
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK10
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK11
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK12
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK13
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST
define void @vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
@@ -36,13 +36,13 @@ define void @vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2(ptr %in.el
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX2: # %bb.0:
@@ -115,13 +115,13 @@ define void @vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4(ptr %in.el
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX2: # %bb.0:
@@ -193,13 +193,13 @@ define void @vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2(ptr %in.el
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX2: # %bb.0:
@@ -263,13 +263,13 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX2: # %bb.0:
@@ -343,17 +343,17 @@ define void @vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX2: # %bb.0:
@@ -429,14 +429,14 @@ define void @vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX2: # %bb.0:
@@ -508,14 +508,14 @@ define void @vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX2: # %bb.0:
@@ -588,15 +588,15 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX2: # %bb.0:
@@ -692,13 +692,13 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX2: # %bb.0:
@@ -783,13 +783,13 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss (%rdi), %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX2: # %bb.0:
@@ -889,23 +889,23 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; AVX2: # %bb.0:
@@ -992,19 +992,19 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
; AVX2: # %bb.0:
@@ -1088,19 +1088,19 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
; AVX2: # %bb.0:
@@ -1181,17 +1181,17 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615]
-; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615]
+; AVX1-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm2, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
; AVX2: # %bb.0:
@@ -1283,21 +1283,21 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.
; SSE42-NEXT: movdqa %xmm3, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa 32(%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
; AVX2: # %bb.0:
@@ -1375,16 +1375,16 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
; AVX2: # %bb.0:
@@ -1461,16 +1461,16 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
; AVX2: # %bb.0:
@@ -1545,18 +1545,18 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; AVX2: # %bb.0:
@@ -1630,17 +1630,17 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX2: # %bb.0:
@@ -1740,17 +1740,17 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; AVX2: # %bb.0:
@@ -1865,25 +1865,25 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
; AVX2: # %bb.0:
@@ -1982,20 +1982,20 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
; AVX2: # %bb.0:
@@ -2093,20 +2093,20 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,0,0,0]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; AVX2: # %bb.0:
@@ -2201,22 +2201,22 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
; AVX2: # %bb.0:
@@ -2314,20 +2314,20 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,1,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,1,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; AVX2: # %bb.0:
@@ -2422,20 +2422,20 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,0,1,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX2: # %bb.0:
@@ -2539,21 +2539,21 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; SSE42-NEXT: movdqa %xmm3, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxwq {{.*#+}} xmm0 = [18446744073709551360,18446744073709551615]
-; AVX-NEXT: vmovdqa (%rdi), %xmm1
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX-NEXT: vpblendvb %xmm0, 48(%rdi), %xmm1, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm0 = [18446744073709551360,18446744073709551615]
+; AVX1-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vpblendvb %xmm0, 48(%rdi), %xmm1, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
@@ -2649,17 +2649,17 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615]
-; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615]
+; AVX1-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
@@ -2755,22 +2755,22 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
; SSE42-NEXT: movdqa %xmm2, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,0,0,0]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX2: # %bb.0:
@@ -2861,18 +2861,18 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX2: # %bb.0:
@@ -2961,17 +2961,17 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX2: # %bb.0:
@@ -3063,20 +3063,20 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX2: # %bb.0:
@@ -3165,21 +3165,21 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa (%rdi), %xmm3
-; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddb (%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa (%rdi), %xmm3
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
@@ -3267,16 +3267,16 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
@@ -3361,22 +3361,22 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps 48(%rdi), %xmm0
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[1,3],ymm1[4,4],ymm0[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm0
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[1,3],ymm1[4,4],ymm0[5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX2: # %bb.0:
@@ -3464,20 +3464,20 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2: # %bb.0:
@@ -3565,21 +3565,21 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa (%rdi), %xmm2
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa (%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
@@ -3662,16 +3662,16 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
@@ -3750,21 +3750,21 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa (%rdi), %xmm2
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa (%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
@@ -3846,18 +3846,18 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovapd (%rdi), %ymm0
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[0,1]
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovapd (%rdi), %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[0,1]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
@@ -3924,19 +3924,19 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX2: # %bb.0:
@@ -4000,18 +4000,18 @@ define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0]
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX2: # %bb.0:
@@ -4075,18 +4075,18 @@ define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.e
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX2: # %bb.0:
@@ -4150,18 +4150,18 @@ define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %i
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX2: # %bb.0:
@@ -4225,19 +4225,19 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
; SSE-NEXT: movdqa %xmm2, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
@@ -4301,19 +4301,19 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss (%rdi), %ymm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss (%rdi), %ymm0
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX2: # %bb.0:
@@ -4379,19 +4379,19 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; AVX2: # %bb.0:
@@ -4457,18 +4457,18 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
; AVX2: # %bb.0:
@@ -4534,19 +4534,19 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
; SSE-NEXT: movdqa %xmm2, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
@@ -4612,19 +4612,19 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
; AVX2: # %bb.0:
@@ -4690,18 +4690,18 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
; AVX2: # %bb.0:
@@ -4767,19 +4767,19 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i
; SSE-NEXT: movdqa %xmm2, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
@@ -4845,18 +4845,18 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
; AVX2: # %bb.0:
@@ -4922,19 +4922,19 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i
; SSE-NEXT: movdqa %xmm2, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
@@ -5000,107 +5000,107 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
; SSE-NEXT: movdqa %xmm2, 48(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq 16(%rdi), %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: movq %rcx, %r8
-; AVX-NEXT: movq %rcx, %r9
-; AVX-NEXT: movq %rcx, %r10
-; AVX-NEXT: movl %ecx, %r11d
-; AVX-NEXT: movl %ecx, %ebx
-; AVX-NEXT: vmovd %ecx, %xmm0
-; AVX-NEXT: shrl $8, %ecx
-; AVX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT: shrl $16, %ebx
-; AVX-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
-; AVX-NEXT: shrl $24, %r11d
-; AVX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; AVX-NEXT: shrq $32, %r10
-; AVX-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
-; AVX-NEXT: shrq $40, %r9
-; AVX-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
-; AVX-NEXT: shrq $48, %r8
-; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
-; AVX-NEXT: movq 24(%rdi), %rcx
-; AVX-NEXT: shrq $56, %rax
-; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: shrl $8, %eax
-; AVX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: shrl $16, %eax
-; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: shrl $24, %eax
-; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: shrq $32, %rax
-; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: shrq $40, %rax
-; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: shrq $48, %rax
-; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: shrq $56, %rcx
-; AVX-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $8, %ecx
-; AVX-NEXT: vmovd %eax, %xmm1
-; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $24, %ecx
-; AVX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shrq $40, %rcx
-; AVX-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shrq $48, %rcx
-; AVX-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
-; AVX-NEXT: movq 8(%rdi), %rcx
-; AVX-NEXT: shrq $56, %rax
-; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: shrl $8, %eax
-; AVX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: shrl $16, %eax
-; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: shrl $24, %eax
-; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: shrq $32, %rax
-; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: shrq $40, %rax
-; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: shrq $48, %rax
-; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrq $56, %rcx
-; AVX-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rdx)
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: movq 16(%rdi), %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: movq %rcx, %r8
+; AVX1-NEXT: movq %rcx, %r9
+; AVX1-NEXT: movq %rcx, %r10
+; AVX1-NEXT: movl %ecx, %r11d
+; AVX1-NEXT: movl %ecx, %ebx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: shrl $8, %ecx
+; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $16, %ebx
+; AVX1-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $24, %r11d
+; AVX1-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: shrq $32, %r10
+; AVX1-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: shrq $40, %r9
+; AVX1-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: shrq $48, %r8
+; AVX1-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: movq 24(%rdi), %rcx
+; AVX1-NEXT: shrq $56, %rax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: shrl $8, %eax
+; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: shrq $32, %rax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: shrq $40, %rax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: shrq $48, %rax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: shrq $56, %rcx
+; AVX1-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $8, %ecx
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $24, %ecx
+; AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $40, %rcx
+; AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq 8(%rdi), %rcx
+; AVX1-NEXT: shrq $56, %rax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: shrl $8, %eax
+; AVX1-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: shrq $32, %rax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: shrq $40, %rax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: shrq $48, %rax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shrq $56, %rcx
+; AVX1-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
@@ -5238,22 +5238,3 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX1-ONLY: {{.*}}
-; AVX2-FAST: {{.*}}
-; AVX2-FAST-PERLANE: {{.*}}
-; AVX2-SLOW: {{.*}}
-; FALLBACK0: {{.*}}
-; FALLBACK1: {{.*}}
-; FALLBACK10: {{.*}}
-; FALLBACK11: {{.*}}
-; FALLBACK12: {{.*}}
-; FALLBACK13: {{.*}}
-; FALLBACK2: {{.*}}
-; FALLBACK3: {{.*}}
-; FALLBACK4: {{.*}}
-; FALLBACK5: {{.*}}
-; FALLBACK6: {{.*}}
-; FALLBACK7: {{.*}}
-; FALLBACK8: {{.*}}
-; FALLBACK9: {{.*}}
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 5b4cdd2feca06..d5a724139ffd3 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1,18 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-SLOW,FALLBACK8
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-FAST,FALLBACK9
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK10
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK11
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK12
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK13
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST

define void @vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
@@ -36,13 +36,13 @@ define void @vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2(ptr %in.el
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX2: # %bb.0:
@@ -115,13 +115,13 @@ define void @vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4(ptr %in.el
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX2: # %bb.0:
@@ -193,13 +193,13 @@ define void @vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2(ptr %in.el
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX2: # %bb.0:
@@ -263,13 +263,13 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX2: # %bb.0:
@@ -343,17 +343,17 @@ define void @vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX2: # %bb.0:
@@ -429,14 +429,14 @@ define void @vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX2: # %bb.0:
@@ -508,14 +508,14 @@ define void @vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX2: # %bb.0:
@@ -588,15 +588,15 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX2: # %bb.0:
@@ -692,13 +692,13 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX2: # %bb.0:
@@ -783,13 +783,13 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss (%rdi), %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX2: # %bb.0:
@@ -889,23 +889,23 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
; AVX2: # %bb.0:
@@ -992,19 +992,19 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
; AVX2: # %bb.0:
@@ -1088,19 +1088,19 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
; AVX2: # %bb.0:
@@ -1181,17 +1181,17 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615]
-; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615]
+; AVX1-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm2, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
; AVX2: # %bb.0:
@@ -1283,21 +1283,21 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.
; SSE42-NEXT: movdqa %xmm3, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa 32(%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
; AVX2: # %bb.0:
@@ -1375,16 +1375,16 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
; AVX2: # %bb.0:
@@ -1461,16 +1461,16 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
; AVX2: # %bb.0:
@@ -1545,18 +1545,18 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; AVX2: # %bb.0:
@@ -1630,17 +1630,17 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX2: # %bb.0:
@@ -1740,17 +1740,17 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; AVX2: # %bb.0:
@@ -1865,22 +1865,22 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; SSE42-NEXT: movdqa %xmm2, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
; AVX2: # %bb.0:
@@ -1991,22 +1991,22 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; SSE42-NEXT: movdqa %xmm2, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
-; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
+; AVX1-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
; AVX2: # %bb.0:
@@ -2115,20 +2115,20 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; SSE42-NEXT: movdqa %xmm2, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX1-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; AVX2: # %bb.0:
@@ -2241,22 +2241,22 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; SSE42-NEXT: movdqa %xmm2, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
-; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
+; AVX1-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
; AVX2: # %bb.0:
@@ -2365,20 +2365,20 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
; SSE42-NEXT: movdqa %xmm2, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360]
-; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360]
+; AVX1-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; AVX2: # %bb.0:
@@ -2492,21 +2492,21 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm2, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967040,4294967295,4294967295,4294967040]
-; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967040,4294967295,4294967295,4294967040]
+; AVX1-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX2: # %bb.0:
@@ -2615,19 +2615,19 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615]
-; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615]
+; AVX1-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
@@ -2741,20 +2741,20 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615]
-; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vmovaps 32(%rsi), %ymm2
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615]
+; AVX1-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vmovaps 32(%rsi), %ymm2
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps %ymm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
@@ -2861,22 +2861,22 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
; SSE42-NEXT: movdqa %xmm1, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX2: # %bb.0:
@@ -2978,22 +2978,22 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
-; AVX-NEXT: vmovdqa (%rdi), %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
+; AVX1-NEXT: vmovdqa (%rdi), %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX2-SLOW: # %bb.0:
@@ -3130,19 +3130,19 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX2-SLOW: # %bb.0:
@@ -3310,22 +3310,22 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX2-SLOW: # %bb.0:
@@ -3462,19 +3462,19 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
@@ -3577,19 +3577,19 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 32(%rsi), %ymm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovaps 32(%rsi), %ymm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovaps %ymm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
@@ -3690,25 +3690,25 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss (%rdi), %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX-NEXT: vmovaps 48(%rdi), %xmm2
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,0],ymm2[1,3],ymm3[4,4],ymm2[5,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,1,3]
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm3, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX1-NEXT: vmovaps 48(%rdi), %xmm2
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,0],ymm2[1,3],ymm3[4,4],ymm2[5,7]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,1,3]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm3, %xmm2
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX2-SLOW: # %bb.0:
@@ -3839,23 +3839,23 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7]
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,0,1,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7]
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2-SLOW: # %bb.0:
@@ -3987,20 +3987,20 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
@@ -4096,21 +4096,21 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vmovaps 32(%rsi), %ymm2
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vmovaps 32(%rsi), %ymm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovaps %ymm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
@@ -4202,19 +4202,19 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; SSE42-NEXT: movdqa %xmm2, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
-; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
@@ -4306,19 +4306,19 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
-; AVX-NEXT: vmovaps 32(%rsi), %ymm2
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
+; AVX1-NEXT: vmovaps 32(%rsi), %ymm2
+; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vmovaps %ymm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
@@ -4416,19 +4416,19 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX2: # %bb.0:
@@ -4515,19 +4515,19 @@ define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in
; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX2: # %bb.0:
@@ -4614,19 +4614,19 @@ define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.e
; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX2: # %bb.0:
@@ -4696,19 +4696,19 @@ define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %i
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX2: # %bb.0:
@@ -4776,19 +4776,19 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
; SSE-NEXT: movdqa %xmm3, (%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm2
-; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps %xmm2, 16(%rdx)
-; AVX-NEXT: vmovaps %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps 16(%rsi), %xmm2
+; AVX1-NEXT: vmovaps 48(%rsi), %xmm3
+; AVX1-NEXT: vmovaps %xmm2, 16(%rdx)
+; AVX1-NEXT: vmovaps %xmm3, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
@@ -4877,19 +4877,19 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i
; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX2: # %bb.0:
@@ -4979,20 +4979,20 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; AVX2: # %bb.0:
@@ -5081,19 +5081,19 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i
; SSE42-NEXT: movdqa %xmm1, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
; AVX2: # %bb.0:
@@ -5181,19 +5181,19 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
; SSE42-NEXT: movdqa %xmm3, (%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm2
-; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps %xmm2, 16(%rdx)
-; AVX-NEXT: vmovaps %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps 16(%rsi), %xmm2
+; AVX1-NEXT: vmovaps 48(%rsi), %xmm3
+; AVX1-NEXT: vmovaps %xmm2, 16(%rdx)
+; AVX1-NEXT: vmovaps %xmm3, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
@@ -5284,23 +5284,23 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
; AVX2: # %bb.0:
@@ -5376,22 +5376,22 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
; AVX2: # %bb.0:
@@ -5465,18 +5465,18 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i
; SSE-NEXT: movdqa %xmm3, (%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm2
-; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps %xmm2, 16(%rdx)
-; AVX-NEXT: vmovaps %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps 16(%rsi), %xmm2
+; AVX1-NEXT: vmovaps 48(%rsi), %xmm3
+; AVX1-NEXT: vmovaps %xmm2, 16(%rdx)
+; AVX1-NEXT: vmovaps %xmm3, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
@@ -5550,22 +5550,22 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i
; SSE-NEXT: movdqa %xmm1, 16(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rdx)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
+; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
; AVX2: # %bb.0:
@@ -5639,18 +5639,18 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i
; SSE-NEXT: movdqa %xmm3, (%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm2
-; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps %xmm2, 16(%rdx)
-; AVX-NEXT: vmovaps %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps 16(%rsi), %xmm2
+; AVX1-NEXT: vmovaps 48(%rsi), %xmm3
+; AVX1-NEXT: vmovaps %xmm2, 16(%rdx)
+; AVX1-NEXT: vmovaps %xmm3, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
@@ -5722,63 +5722,63 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
; SSE-NEXT: movdqa %xmm3, 32(%rdx)
; SSE-NEXT: retq
;
-; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: movq %rax, %r8
-; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: movq %rax, %r10
-; AVX-NEXT: movl %eax, %r11d
-; AVX-NEXT: movl %eax, %ebx
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: shrl $8, %eax
-; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: shrl $16, %ebx
-; AVX-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
-; AVX-NEXT: shrl $24, %r11d
-; AVX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; AVX-NEXT: shrq $32, %r10
-; AVX-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
-; AVX-NEXT: shrq $40, %r9
-; AVX-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
-; AVX-NEXT: shrq $48, %r8
-; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
-; AVX-NEXT: movq 8(%rdi), %rax
-; AVX-NEXT: shrq $56, %rcx
-; AVX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $8, %ecx
-; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $24, %ecx
-; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shrq $40, %rcx
-; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shrq $48, %rcx
-; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX-NEXT: shrq $56, %rax
-; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm2
-; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps %xmm2, 16(%rdx)
-; AVX-NEXT: vmovaps %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: retq
+; AVX1-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: movq %rax, %r8
+; AVX1-NEXT: movq %rax, %r9
+; AVX1-NEXT: movq %rax, %r10
+; AVX1-NEXT: movl %eax, %r11d
+; AVX1-NEXT: movl %eax, %ebx
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: shrl $8, %eax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: shrl $16, %ebx
+; AVX1-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $24, %r11d
+; AVX1-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: shrq $32, %r10
+; AVX1-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: shrq $40, %r9
+; AVX1-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: shrq $48, %r8
+; AVX1-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: movq 8(%rdi), %rax
+; AVX1-NEXT: shrq $56, %rcx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $8, %ecx
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $24, %ecx
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $40, %rcx
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrq $56, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps 16(%rsi), %xmm2
+; AVX1-NEXT: vmovaps 48(%rsi), %xmm3
+; AVX1-NEXT: vmovaps %xmm2, 16(%rdx)
+; AVX1-NEXT: vmovaps %xmm3, 48(%rdx)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
@@ -5879,19 +5879,3 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX1-ONLY: {{.*}}
-; FALLBACK0: {{.*}}
-; FALLBACK1: {{.*}}
-; FALLBACK10: {{.*}}
-; FALLBACK11: {{.*}}
-; FALLBACK12: {{.*}}
-; FALLBACK13: {{.*}}
-; FALLBACK2: {{.*}}
-; FALLBACK3: {{.*}}
-; FALLBACK4: {{.*}}
-; FALLBACK5: {{.*}}
-; FALLBACK6: {{.*}}
-; FALLBACK7: {{.*}}
-; FALLBACK8: {{.*}}
-; FALLBACK9: {{.*}}