[llvm] r288804 - [X86][AVX512] Detect repeated constant patterns in BUILD_VECTOR suitable for broadcasting.
Ayman Musa via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 6 04:24:14 PST 2016
Author: aymanmus
Date: Tue Dec 6 06:24:14 2016
New Revision: 288804
URL: http://llvm.org/viewvc/llvm-project?rev=288804&view=rev
Log:
[X86][AVX512] Detect repeated constant patterns in BUILD_VECTOR suitable for broadcasting.
Check if a build_vector node includes a repeated constant pattern and replace it with a broadcast of that pattern.
For example:
"build_vector <0, 1, 2, 3, 0, 1, 2, 3>" would be replaced by "broadcast <0, 1, 2, 3>"
Differential Revision: https://reviews.llvm.org/D26802
Added:
llvm/trunk/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avg.ll
llvm/trunk/test/CodeGen/X86/vec_shift6.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v64.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=288804&r1=288803&r2=288804&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Dec 6 06:24:14 2016
@@ -6316,8 +6316,47 @@ static SDValue EltsFromConsecutiveLoads(
return SDValue();
}
-/// Attempt to use the vbroadcast instruction to generate a splat value for a
-/// splat BUILD_VECTOR which uses a single scalar load, or a constant.
+/// Materialize the repeated portion of a constant splat as an IR constant
+/// vector. \p SplatValue carries the SplatBitSize-bit pattern; it is cut into
+/// SplatBitSize / VT.getScalarSizeInBits() elements, lowest bits first, and
+/// each piece becomes a float, double or integer constant according to the
+/// scalar type of \p VT.
+static Constant *getConstantVector(MVT VT, APInt SplatValue,
+                                   unsigned SplatBitSize, LLVMContext &C) {
+  const unsigned EltBits = VT.getScalarSizeInBits();
+  const unsigned EltCount = SplatBitSize / EltBits;
+
+  SmallVector<Constant *, 32> Elts;
+  for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
+    // Shift the wanted element down to bit 0 and drop everything above it.
+    APInt Piece = SplatValue.lshr(EltBits * Idx).trunc(EltBits);
+
+    if (!VT.isFloatingPoint()) {
+      Elts.push_back(
+          Constant::getIntegerValue(Type::getIntNTy(C, EltBits), Piece));
+      continue;
+    }
+
+    // Only f32 and f64 elements can be rebuilt from their raw bit pattern
+    // here.
+    assert((EltBits == 32 || EltBits == 64) &&
+           "Unsupported floating point scalar size");
+    Constant *FPElt =
+        EltBits == 32
+            ? ConstantFP::get(Type::getFloatTy(C), Piece.bitsToFloat())
+            : ConstantFP::get(Type::getDoubleTy(C), Piece.bitsToDouble());
+    Elts.push_back(FPElt);
+  }
+  return ConstantVector::get(ArrayRef<Constant *>(Elts));
+}
+
+/// Return true if any user of \p N is a node whose opcode satisfies
+/// isTargetShuffle(). BITCAST users are looked through: a bitcast counts as a
+/// shuffle use when any of its own users (recursively) does.
+static bool isUseOfShuffle(SDNode *N) {
+  for (auto *U : N->uses()) {
+    if (isTargetShuffle(U->getOpcode()))
+      return true;
+    // Look through bitcasts, but keep scanning the remaining users when this
+    // bitcast does not lead to a shuffle. Returning the recursive result
+    // unconditionally would stop at the first bitcast user and hide shuffle
+    // users that appear later in the use list.
+    if (U->getOpcode() == ISD::BITCAST && isUseOfShuffle(U))
+      return true;
+  }
+  return false;
+}
+
+/// Attempt to use the vbroadcast instruction to generate a splat value for the
+/// following cases:
+/// 1. A splat BUILD_VECTOR which uses:
+/// a. A single scalar load, or a constant.
+/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load, or a constant.
+///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
@@ -6339,8 +6378,82 @@ static SDValue LowerVectorBroadcast(Buil
// We need a splat of a single value to use broadcast, and it doesn't
// make any sense if the value is only in one element of the vector.
- if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
+ if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
+ APInt SplatValue, Undef;
+ unsigned SplatBitSize;
+ bool HasUndef;
+ // Check if this is a repeated constant pattern suitable for broadcasting.
+ if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
+ SplatBitSize > VT.getScalarSizeInBits() &&
+ SplatBitSize < VT.getSizeInBits()) {
+ // Avoid replacing with broadcast when it's a use of a shuffle
+ // instruction to preserve the present custom lowering of shuffles.
+ if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
+ return SDValue();
+ // Replace BUILD_VECTOR with a broadcast of the repeated constants.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ LLVMContext *Ctx = DAG.getContext();
+ MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (Subtarget.hasAVX()) {
+ if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
+ !(SplatBitSize == 64 && Subtarget.is32Bit())) {
+ // Splatted value can fit in one INTEGER constant in constant pool.
+ // Load the constant and broadcast it.
+ MVT CVT = MVT::getIntegerVT(SplatBitSize);
+ Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
+ Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
+ SDValue CP = DAG.getConstantPool(C, PVT);
+ unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ Ld = DAG.getLoad(
+ CVT, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+ MVT::getVectorVT(CVT, Repeat), Ld);
+ return DAG.getBitcast(VT, Brdcst);
+ } else if (SplatBitSize == 32 || SplatBitSize == 64) {
+ // Splatted value can fit in one FLOAT constant in constant pool.
+ // Load the constant and broadcast it.
+ // AVX supports 32- and 64-bit broadcasts for floating-point types only.
+ // 64-bit integers are not available on 32-bit subtargets.
+ MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
+ Constant *C = SplatBitSize == 32
+ ? ConstantFP::get(Type::getFloatTy(*Ctx),
+ SplatValue.bitsToFloat())
+ : ConstantFP::get(Type::getDoubleTy(*Ctx),
+ SplatValue.bitsToDouble());
+ SDValue CP = DAG.getConstantPool(C, PVT);
+ unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ Ld = DAG.getLoad(
+ CVT, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+ MVT::getVectorVT(CVT, Repeat), Ld);
+ return DAG.getBitcast(VT, Brdcst);
+ } else if (SplatBitSize > 64) {
+ // Load the vector of constants and broadcast it.
+ MVT CVT = VT.getScalarType();
+ Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
+ *Ctx);
+ SDValue VCP = DAG.getConstantPool(VecC, PVT);
+ unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
+ unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
+ Ld = DAG.getLoad(
+ MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
+ return DAG.getBitcast(VT, Brdcst);
+ }
+ }
+ }
return SDValue();
+ }
bool ConstSplatVal =
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
Modified: llvm/trunk/test/CodeGen/X86/avg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avg.ll?rev=288804&r1=288803&r2=288804&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avg.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avg.ll Tue Dec 6 06:24:14 2016
@@ -2132,7 +2132,7 @@ define void @avg_v64i8_const(<64 x i8>*
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
@@ -2405,7 +2405,7 @@ define void @avg_v32i16_const(<32 x i16>
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
Added: llvm/trunk/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll?rev=288804&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll (added)
+++ llvm/trunk/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll Tue Dec 6 06:24:14 2016
@@ -0,0 +1,1205 @@
+; NOTE: Assertions have been simplified MANUALLY after running utils/update_llc_test_checks.py
+; Assertions for constant pools have been added MANUALLY.
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=AVX512BW -check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX2 -check-prefix=AVX2-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX512 -check-prefix=AVX512F-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=AVX512BW -check-prefix=AVX512 -check-prefix=AVX512BW-64
+
+;===-----------------------------------------------------------------------------===
+; This test checks the ability to recognize a cross element pattern of
+; constants and perform the load via broadcasting a smaller constant
+; vector.
+; For example:
+; <i32 0, i32 1, i32 0, i32 1> => broadcast of the constant vector <i32 0, i32 1>
+;===-----------------------------------------------------------------------------===
+
+; ALL: LCPI0
+; ALL-NEXT: .short 256 # 0x100
+
+define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
+; ALL32-LABEL: f16xi8_i16:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: retl
+;
+; ALL64-LABEL: f16xi8_i16:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: retq
+ %res1 = add <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
+ %res2 = and <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
+ ret <16 x i8> %res2
+}
+
+
+; ALL: .LCPI1
+; ALL-NEXT: .long 50462976 # 0x3020100
+
+; AVX: .LCPI1
+; AVX-NEXT: .long 50462976 # float 3.82047143E-37
+
+define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
+; ALL32-LABEL: f16xi8_i32:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: retl
+;
+; ALL64-LABEL: f16xi8_i32:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: retq
+;
+; AVX-LABEL: f16xi8_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+ %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
+ %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
+ ret <16 x i8> %res2
+}
+
+
+; ALL64: .LCPI2
+; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100
+
+; AVX: .LCPI2
+; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
+
+define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
+; ALL32-LABEL: f16xi8_i64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: retl
+;
+; ALL64-LABEL: f16xi8_i64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: retq
+;
+; AVX-LABEL: f16xi8_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+ %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
+ %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
+ ret <16 x i8> %res2
+}
+
+
+; ALL: .LCPI3
+; ALL-NEXT: .short 256 # 0x100
+
+define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
+; ALL32-LABEL: f32xi8_i16:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; ALL64-LABEL: f32xi8_i16:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %ymm1
+; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
+ %res1 = add <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
+ %res2 = and <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
+ ret <32 x i8> %res2
+}
+
+
+; ALL: .LCPI4
+; ALL-NEXT: .long 50462976 # 0x3020100
+
+; AVX: .LCPI4
+; AVX-NEXT: .long 50462976 # float 3.82047143E-37
+
+define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
+; ALL32-LABEL: f32xi8_i32:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; ALL64-LABEL: f32xi8_i32:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
+;
+; AVX-LABEL: f32xi8_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+ %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
+ %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
+ ret <32 x i8> %res2
+}
+
+
+; ALL64: .LCPI5
+; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100
+
+; AVX: .LCPI5
+; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
+
+define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
+; ALL32-LABEL: f32xi8_i64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastq {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; ALL64-LABEL: f32xi8_i64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
+; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
+;
+; AVX-LABEL: f32xi8_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+ %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
+ %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
+ ret <32 x i8> %res2
+}
+
+
+; ALL: .LCPI6
+; ALL-NEXT: .byte 0 # 0x0
+; ALL-NEXT: .byte 1 # 0x1
+; ALL-NEXT: .byte 2 # 0x2
+; ALL-NEXT: .byte 3 # 0x3
+; ALL-NEXT: .byte 4 # 0x4
+; ALL-NEXT: .byte 5 # 0x5
+; ALL-NEXT: .byte 6 # 0x6
+; ALL-NEXT: .byte 7 # 0x7
+; ALL-NEXT: .byte 8 # 0x8
+; ALL-NEXT: .byte 9 # 0x9
+; ALL-NEXT: .byte 10 # 0xa
+; ALL-NEXT: .byte 11 # 0xb
+; ALL-NEXT: .byte 12 # 0xc
+; ALL-NEXT: .byte 13 # 0xd
+; ALL-NEXT: .byte 14 # 0xe
+; ALL-NEXT: .byte 15 # 0xf
+; ALL-NOT: .byte
+
+define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
+; ALL-LABEL: f32xi8_i128:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+ %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
+ %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
+ ret <32 x i8> %res2
+}
+
+
+; ALL: .LCPI7
+; ALL-NEXT: .short 256 # 0x100
+
+define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
+; NO-AVX512BW-LABEL: f64xi8_i16:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f64xi8_i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+ %res1 = add <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
+ %res2 = and <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
+ ret <64 x i8> %res2
+}
+
+
+; ALL: .LCPI8
+; ALL-NEXT: .long 50462976 # 0x3020100
+
+; AVX: .LCPI8
+; AVX-NEXT: .long 50462976 # float 3.82047143E-37
+
+define <64 x i8> @f64i8_i32(<64 x i8> %a) {
+; NO-AVX512BW-LABEL: f64i8_i32:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f64i8_i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+;
+; AVX-LABEL: f64i8_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+ %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
+ %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
+ ret <64 x i8> %res2
+}
+
+
+; ALL64: .LCPI9
+; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100
+
+; ALL32: .LCPI9
+; ALL32-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
+
+; AVX: .LCPI9
+; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
+
+define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
+; NO-AVX512BW-LABEL: f64xi8_i64:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f64xi8_i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+;
+; AVX-LABEL: f64xi8_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+ %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
+ %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
+ ret <64 x i8> %res2
+}
+
+
+; ALL: .LCPI10
+; ALL-NEXT: .byte 0 # 0x0
+; ALL-NEXT: .byte 1 # 0x1
+; ALL-NEXT: .byte 2 # 0x2
+; ALL-NEXT: .byte 3 # 0x3
+; ALL-NEXT: .byte 4 # 0x4
+; ALL-NEXT: .byte 5 # 0x5
+; ALL-NEXT: .byte 6 # 0x6
+; ALL-NEXT: .byte 7 # 0x7
+; ALL-NEXT: .byte 8 # 0x8
+; ALL-NEXT: .byte 9 # 0x9
+; ALL-NEXT: .byte 10 # 0xa
+; ALL-NEXT: .byte 11 # 0xb
+; ALL-NEXT: .byte 12 # 0xc
+; ALL-NEXT: .byte 13 # 0xd
+; ALL-NEXT: .byte 14 # 0xe
+; ALL-NEXT: .byte 15 # 0xf
+; ALL-NOT: .byte
+
+define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
+; NO-AVX512BW-LABEL: f64xi8_i128:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f64xi8_i128:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+ %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
+ %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
+ ret <64 x i8> %res2
+}
+
+
+; AVX512BW: .LCPI11
+; AVX512BW-NEXT: .byte 0 # 0x0
+; AVX512BW-NEXT: .byte 1 # 0x1
+; AVX512BW-NEXT: .byte 2 # 0x2
+; AVX512BW-NEXT: .byte 3 # 0x3
+; AVX512BW-NEXT: .byte 4 # 0x4
+; AVX512BW-NEXT: .byte 5 # 0x5
+; AVX512BW-NEXT: .byte 6 # 0x6
+; AVX512BW-NEXT: .byte 7 # 0x7
+; AVX512BW-NEXT: .byte 8 # 0x8
+; AVX512BW-NEXT: .byte 9 # 0x9
+; AVX512BW-NEXT: .byte 10 # 0xa
+; AVX512BW-NEXT: .byte 11 # 0xb
+; AVX512BW-NEXT: .byte 12 # 0xc
+; AVX512BW-NEXT: .byte 13 # 0xd
+; AVX512BW-NEXT: .byte 14 # 0xe
+; AVX512BW-NEXT: .byte 15 # 0xf
+; AVX512BW-NEXT: .byte 16 # 0x10
+; AVX512BW-NEXT: .byte 17 # 0x11
+; AVX512BW-NEXT: .byte 18 # 0x12
+; AVX512BW-NEXT: .byte 19 # 0x13
+; AVX512BW-NEXT: .byte 20 # 0x14
+; AVX512BW-NEXT: .byte 21 # 0x15
+; AVX512BW-NEXT: .byte 22 # 0x16
+; AVX512BW-NEXT: .byte 23 # 0x17
+; AVX512BW-NEXT: .byte 24 # 0x18
+; AVX512BW-NEXT: .byte 25 # 0x19
+; AVX512BW-NEXT: .byte 26 # 0x1a
+; AVX512BW-NEXT: .byte 27 # 0x1b
+; AVX512BW-NEXT: .byte 28 # 0x1c
+; AVX512BW-NEXT: .byte 29 # 0x1d
+; AVX512BW-NEXT: .byte 30 # 0x1e
+; AVX512BW-NEXT: .byte 31 # 0x1f
+; AVX512BW-NOT: .byte
+
+define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
+; AVX512BW-LABEL: f64xi8_i256:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+ %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %a
+ %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %res1
+ ret <64 x i8> %res2
+}
+
+
+; ALL: .LCPI12
+; ALL-NEXT: .long 65536 # 0x10000
+
+; AVX: .LCPI12
+; AVX-NEXT: .long 65536 # float 9.18354962E-41
+
+define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
+; ALL32-LABEL: f8xi16_i32:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: retl
+;
+; ALL64-LABEL: f8xi16_i32:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: retq
+;
+; AVX-LABEL: f8xi16_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+ %res1 = add <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
+ %res2 = and <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
+ ret <8 x i16> %res2
+}
+
+
+; ALL64: .LCPI13
+; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
+
+; ALL32: .LCPI13
+; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
+
+; AVX: .LCPI13
+; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
+
+define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
+; ALL32-LABEL: f8xi16_i64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: retl
+;
+; ALL64-LABEL: f8xi16_i64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: retq
+;
+; AVX-LABEL: f8xi16_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+ %res1 = add <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
+ %res2 = and <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
+ ret <8 x i16> %res2
+}
+
+
+; ALL: .LCPI14
+; ALL-NEXT: .long 65536 # 0x10000
+
+; AVX: .LCPI14
+; AVX-NEXT: .long 65536 # float 9.18354962E-41
+
+define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
+; ALL-LABEL: f16xi16_i32:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
+; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+;
+; AVX-LABEL: f16xi16_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2
+; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+ %res1 = add <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
+ %res2 = and <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
+ ret <16 x i16> %res2
+}
+
+
+; ALL64: .LCPI15
+; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
+
+; ALL32: .LCPI15
+; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
+
+; AVX: .LCPI15
+; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
+
+define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
+; ALL-LABEL: f16xi16_i64:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1
+; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+;
+; AVX-LABEL: f16xi16_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+ %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
+ %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
+ ret <16 x i16> %res2
+}
+
+
+; ALL: .LCPI16
+; ALL-NEXT: .short 0 # 0x0
+; ALL-NEXT: .short 1 # 0x1
+; ALL-NEXT: .short 2 # 0x2
+; ALL-NEXT: .short 3 # 0x3
+; ALL-NEXT: .short 4 # 0x4
+; ALL-NEXT: .short 5 # 0x5
+; ALL-NEXT: .short 6 # 0x6
+; ALL-NEXT: .short 7 # 0x7
+; ALL-NOT: .short
+
+define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
+; ALL-LABEL: f16xi16_i128:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+ %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
+ %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
+ ret <16 x i16> %res2
+}
+
+
+; ALL: .LCPI17
+; ALL-NEXT: .long 65536 # 0x10000
+
+; AVX: .LCPI17
+; AVX-NEXT: .long 65536 # float 9.18354962E-41
+
+define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
+; NO-AVX512BW-LABEL: f32xi16_i32:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f32xi16_i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+;
+; AVX-LABEL: f32xi16_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+ %res1 = add <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
+ %res2 = and <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
+ ret <32 x i16> %res2
+}
+
+
+; ALL64: .LCPI18
+; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
+
+; ALL32: .LCPI18
+; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
+
+; AVX: .LCPI18
+; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
+
+define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
+; NO-AVX512BW-LABEL: f32xi16_i64:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f32xi16_i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+;
+; AVX-LABEL: f32xi16_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+ %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
+ %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
+ ret <32 x i16> %res2
+}
+
+
+; ALL: .LCPI19
+; ALL-NEXT: .short 0 # 0x0
+; ALL-NEXT: .short 1 # 0x1
+; ALL-NEXT: .short 2 # 0x2
+; ALL-NEXT: .short 3 # 0x3
+; ALL-NEXT: .short 4 # 0x4
+; ALL-NEXT: .short 5 # 0x5
+; ALL-NEXT: .short 6 # 0x6
+; ALL-NEXT: .short 7 # 0x7
+; ALL-NOT: .short
+
+define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
+; NO-AVX512BW-LABEL: f32xi16_i128:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f32xi16_i128:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+ %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
+ %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
+ ret <32 x i16> %res2
+}
+
+
+; AVX512BW: .LCPI20
+; AVX512BW-NEXT: .short 0 # 0x0
+; AVX512BW-NEXT: .short 1 # 0x1
+; AVX512BW-NEXT: .short 2 # 0x2
+; AVX512BW-NEXT: .short 3 # 0x3
+; AVX512BW-NEXT: .short 4 # 0x4
+; AVX512BW-NEXT: .short 5 # 0x5
+; AVX512BW-NEXT: .short 6 # 0x6
+; AVX512BW-NEXT: .short 7 # 0x7
+; AVX512BW-NEXT: .short 8 # 0x8
+; AVX512BW-NEXT: .short 9 # 0x9
+; AVX512BW-NEXT: .short 10 # 0xa
+; AVX512BW-NEXT: .short 11 # 0xb
+; AVX512BW-NEXT: .short 12 # 0xc
+; AVX512BW-NEXT: .short 13 # 0xd
+; AVX512BW-NEXT: .short 14 # 0xe
+; AVX512BW-NEXT: .short 15 # 0xf
+; AVX512BW-NOT: .short
+
+define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
+; AVX512BW-LABEL: f32xi16_i256:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+ %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
+ %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %res1
+ ret <32 x i16> %res2
+}
+
+
+; ALL64: .LCPI21
+; ALL64-NEXT: .quad 4294967296 # 0x100000000
+
+; ALL32: .LCPI21
+; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
+
+; AVX: .LCPI21
+; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
+
+define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
+; ALL32-LABEL: f4xi32_i64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: retl
+;
+; ALL64-LABEL: f4xi32_i64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: retq
+;
+; AVX-LABEL: f4xi32_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+ %res1 = add <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %a
+ %res2 = and <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %res1
+ ret <4 x i32> %res2
+}
+
+
+; ALL64: .LCPI22
+; ALL64-NEXT: .quad 4294967296 # 0x100000000
+
+; ALL32: .LCPI22
+; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
+
+; AVX: .LCPI22
+; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
+
+define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
+; ALL-LABEL: f8xi32_i64:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1
+; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+;
+; AVX-LABEL: f8xi32_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+ %res1 = add <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
+ %res2 = and <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
+ ret <8 x i32> %res2
+}
+
+
+; ALL: .LCPI23
+; ALL-NEXT: .long 0 # 0x0
+; ALL-NEXT: .long 1 # 0x1
+; ALL-NEXT: .long 2 # 0x2
+; ALL-NEXT: .long 3 # 0x3
+; ALL-NOT: .long
+
+define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
+; ALL-LABEL: f8xi32_i128:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+ %res1 = add <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
+ %res2 = and <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
+ ret <8 x i32> %res2
+}
+
+
+; ALL64: .LCPI24
+; ALL64-NEXT: .quad 4294967296 # 0x100000000
+
+; ALL32: .LCPI24
+; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
+
+; AVX: .LCPI24
+; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
+
+define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
+; AVX2-LABEL: f16xi32_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq {{.*}}, %ymm2
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512-LABEL: f16xi32_i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpbroadcastq {{.*}}, %zmm1
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+;
+; AVX-LABEL: f16xi32_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+ %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
+ %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
+ ret <16 x i32> %res2
+}
+
+
+; ALL: .LCPI25
+; ALL-NEXT: .long 0 # 0x0
+; ALL-NEXT: .long 1 # 0x1
+; ALL-NEXT: .long 2 # 0x2
+; ALL-NEXT: .long 3 # 0x3
+; ALL-NOT: .long
+
+define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
+; AVX2-LABEL: f16xi32_i128:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512-LABEL: f16xi32_i128:
+; AVX512: # BB#0:
+; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
+ %res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
+ %res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
+ ret <16 x i32> %res2
+}
+
+
+; ALL64: .LCPI26
+; ALL64-NEXT: .quad 0 # 0x0
+; ALL64-NEXT: .quad 1 # 0x1
+; ALL64-NOT: .quad
+
+define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
+; ALL64-LABEL: f4xi64_i128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
+ %res1 = add <4 x i64> <i64 0, i64 1, i64 0, i64 1>, %a
+ %res2 = and <4 x i64> <i64 0, i64 1, i64 0, i64 1>, %res1
+ ret <4 x i64> %res2
+}
+
+
+; ALL64: .LCPI27
+; ALL64-NEXT: .quad 0 # 0x0
+; ALL64-NEXT: .quad 1 # 0x1
+; ALL64-NOT: .quad
+
+define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
+; AVX2-64-LABEL: f8xi64_i128:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f8xi64_i128:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f8xi64_i128:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %a
+ %res2 = and <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %res1
+ ret <8 x i64> %res2
+}
+
+
+; ALL64: .LCPI28
+; ALL64-NEXT: .quad 0 # 0x0
+; ALL64-NEXT: .quad 1 # 0x1
+; ALL64-NEXT: .quad 2 # 0x2
+; ALL64-NEXT: .quad 3 # 0x3
+; ALL64-NOT: .quad
+
+define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
+; AVX512F-64-LABEL: f8xi64_i256:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f8xi64_i256:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %a
+ %res2 = and <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %res1
+ ret <8 x i64> %res2
+}
+
+
+; ALL: .LCPI29
+; ALL-NEXT: .quad 4575657222482165760
+
+; AVX: .LCPI29
+; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
+
+define <4 x float> @f4xf32_f64(<4 x float> %a) {
+; ALL32-LABEL: f4xf32_f64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; ALL32-NEXT: retl
+;
+; ALL64-LABEL: f4xf32_f64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; ALL64-NEXT: retq
+;
+; AVX-LABEL: f4xf32_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
+ %res1 = fadd <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %a
+ %res2 = fdiv <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %res1
+ ret <4 x float> %res2
+}
+
+
+; ALL64: .LCPI30
+; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000
+
+; ALL32: .LCPI30
+; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
+
+; AVX: .LCPI30
+; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
+
+define <8 x float> @f8xf32_f64(<8 x float> %a) {
+; ALL-LABEL: f8xf32_f64:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd {{.*}}, %ymm1
+; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0
+;
+; AVX-LABEL: f8xf32_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm1
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+ %res1 = fadd <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
+ %res2 = fdiv <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
+ ret <8 x float> %res2
+}
+
+
+; ALL: .LCPI31
+; ALL-NEXT: .long 1082130432 # float 4
+; ALL-NEXT: .long 1065353216 # float 1
+; ALL-NEXT: .long 1073741824 # float 2
+; ALL-NEXT: .long 1077936128 # float 3
+; ALL-NOT: .long
+
+define <8 x float> @f8xf32_f128(<8 x float> %a) {
+; ALL-LABEL: f8xf32_f128:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0
+;
+; AVX-LABEL: f8xf32_f128:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+ %res1 = fadd <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
+ %res2 = fdiv <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
+ ret <8 x float> %res2
+}
+
+
+; ALL64: .LCPI32
+; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000
+
+; ALL32: .LCPI32
+; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
+
+; AVX: .LCPI32
+; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
+
+define <16 x float> @f16xf32_f64(<16 x float> %a) {
+; AVX2-LABEL: f16xf32_f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd {{.*}}, %ymm2
+; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1
+;
+; AVX512-LABEL: f16xf32_f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vbroadcastsd {{.*}}, %zmm1
+; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+;
+; AVX-LABEL: f16xf32_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm2
+; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+ %res1 = fadd <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
+ %res2 = fdiv <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
+ ret <16 x float> %res2
+}
+
+
+; ALL: .LCPI33
+; ALL-NEXT: .long 1082130432 # float 4
+; ALL-NEXT: .long 1065353216 # float 1
+; ALL-NEXT: .long 1073741824 # float 2
+; ALL-NEXT: .long 1077936128 # float 3
+; ALL-NOT: .long
+
+define <16 x float> @f16xf32_f128(<16 x float> %a) {
+; AVX2-LABEL: f16xf32_f128:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1
+;
+; AVX512-LABEL: f16xf32_f128:
+; AVX512: # BB#0:
+; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+;
+; AVX-LABEL: f16xf32_f128:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+ %res1 = fadd <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
+ %res2 = fdiv <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
+ ret <16 x float> %res2
+}
+
+
+; AVX512: .LCPI34
+; AVX512-NEXT: .long 1090519040 # float 8
+; AVX512-NEXT: .long 1065353216 # float 1
+; AVX512-NEXT: .long 1073741824 # float 2
+; AVX512-NEXT: .long 1077936128 # float 3
+; AVX512-NEXT: .long 1082130432 # float 4
+; AVX512-NEXT: .long 1084227584 # float 5
+; AVX512-NEXT: .long 1086324736 # float 6
+; AVX512-NEXT: .long 1088421888 # float 7
+; AVX512-NOT: .long
+
+define <16 x float> @f16xf32_f256(<16 x float> %a) {
+; AVX512-LABEL: f16xf32_f256:
+; AVX512: # BB#0:
+; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+ %res1 = fadd <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %a
+ %res2 = fdiv <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %res1
+ ret <16 x float> %res2
+}
+
+
+; ALL: .LCPI35
+; ALL-NEXT: .quad 4611686018427387904 # double 2
+; ALL-NEXT: .quad 4607182418800017408 # double 1
+; ALL-NOT: .quad
+
+define <4 x double> @f4xf64_f128(<4 x double> %a) {
+; ALL-LABEL: f4xf64_f128:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; ALL-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+;
+; AVX-LABEL: f4xf64_f128:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+ %res1 = fadd <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %a
+ %res2 = fdiv <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %res1
+ ret <4 x double> %res2
+}
+
+
+; ALL: .LCPI36
+; ALL-NEXT: .quad 4611686018427387904 # double 2
+; ALL-NEXT: .quad 4607182418800017408 # double 1
+; ALL-NOT: .quad
+
+define <8 x double> @f8xf64_f128(<8 x double> %a) {
+; AVX2-LABEL: f8xf64_f128:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+;
+; AVX512-LABEL: f8xf64_f128:
+; AVX512: # BB#0:
+; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0
+;
+; AVX-LABEL: f8xf64_f128:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+ %res1 = fadd <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %a
+ %res2 = fdiv <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %res1
+ ret <8 x double> %res2
+}
+
+
+; AVX512: .LCPI37
+; AVX512-NEXT: .quad 4616189618054758400 # double 4
+; AVX512-NEXT: .quad 4607182418800017408 # double 1
+; AVX512-NEXT: .quad 4611686018427387904 # double 2
+; AVX512-NEXT: .quad 4613937818241073152 # double 3
+; AVX512-NOT: .quad
+
+define <8 x double> @f8xf64_f256(<8 x double> %a) {
+; AVX512-LABEL: f8xf64_f256:
+; AVX512: # BB#0:
+; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0
+ %res1 = fadd <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %a
+ %res2 = fdiv <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %res1
+ ret <8 x double> %res2
+}
+
Modified: llvm/trunk/test/CodeGen/X86/vec_shift6.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_shift6.ll?rev=288804&r1=288803&r2=288804&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_shift6.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_shift6.ll Tue Dec 6 06:24:14 2016
@@ -153,14 +153,14 @@ define <32 x i16> @test7(<32 x i16> %a)
;
; AVX2-LABEL: test7:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test7:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT: retq
@@ -183,7 +183,7 @@ define <16 x i32> @test8(<16 x i32> %a)
;
; AVX2-LABEL: test8:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v64.ll?rev=288804&r1=288803&r2=288804&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v64.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v64.ll Tue Dec 6 06:24:14 2016
@@ -466,7 +466,7 @@ define <64 x i8> @shuffle_v64i8_63_zz_61
define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512F-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
@@ -482,7 +482,7 @@ define <64 x i8> @shuffle_v64i8_63_64_61
; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512BW-NEXT: vpbroadcastw {{.*}}(%rip), %ymm3
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3,0,1]
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2
@@ -498,7 +498,7 @@ define <64 x i8> @shuffle_v64i8_63_64_61
;
; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512DQ-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
More information about the llvm-commits
mailing list