[llvm] r330322 - Lowering x86 adds/addus/subs/subus intrinsics (llvm part)

Chandler Carruth via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 26 14:45:04 PDT 2018


FYI, reverting due to crashes. Updated review thread and filed
http://llvm.org/PR37260 with testcase.

On Thu, Apr 19, 2018 at 5:16 AM Alexander Ivchenko via llvm-commits <
llvm-commits at lists.llvm.org> wrote:

> Author: aivchenk
> Date: Thu Apr 19 05:13:30 2018
> New Revision: 330322
>
> URL: http://llvm.org/viewvc/llvm-project?rev=330322&view=rev
> Log:
> Lowering x86 adds/addus/subs/subus intrinsics (llvm part)
>
> This is the patch that lowers x86 intrinsics to native IR
> in order to enable optimizations. The patch also includes folding
> of previously missing saturation patterns so that IR emits the same
> machine instructions as the intrinsics.
>
> Patch by tkrupa
>
> Differential Revision: https://reviews.llvm.org/D44785
>
> Added:
>     llvm/trunk/test/CodeGen/X86/vector-arith-sat.ll
> Modified:
>     llvm/trunk/include/llvm/IR/IntrinsicsX86.td
>     llvm/trunk/lib/IR/AutoUpgrade.cpp
>     llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>     llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
>     llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
>     llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
>     llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll
>     llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
>     llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
>     llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
>     llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
>     llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
>     llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
>     llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll
>     llvm/trunk/test/CodeGen/X86/sse2-schedule.ll
>     llvm/trunk/test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll
>
> Modified: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsX86.td?rev=330322&r1=330321&r2=330322&view=diff
>
> ==============================================================================
> --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td (original)
> +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td Thu Apr 19 05:13:30 2018
> @@ -378,30 +378,6 @@ let TargetPrefix = "x86" in {  // All in
>
>  // Integer arithmetic ops.
>  let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
> -  def int_x86_sse2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb128">,
> -              Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
> -                         llvm_v16i8_ty], [IntrNoMem, Commutative]>;
> -  def int_x86_sse2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw128">,
> -              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
> -                         llvm_v8i16_ty], [IntrNoMem, Commutative]>;
> -  def int_x86_sse2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb128">,
> -              Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
> -                         llvm_v16i8_ty], [IntrNoMem, Commutative]>;
> -  def int_x86_sse2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw128">,
> -              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
> -                         llvm_v8i16_ty], [IntrNoMem, Commutative]>;
> -  def int_x86_sse2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb128">,
> -              Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
> -                         llvm_v16i8_ty], [IntrNoMem]>;
> -  def int_x86_sse2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw128">,
> -              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
> -                         llvm_v8i16_ty], [IntrNoMem]>;
> -  def int_x86_sse2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb128">,
> -              Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
> -                         llvm_v16i8_ty], [IntrNoMem]>;
> -  def int_x86_sse2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw128">,
> -              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
> -                         llvm_v8i16_ty], [IntrNoMem]>;
>    def int_x86_sse2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw128">,
>                Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
>                           llvm_v8i16_ty], [IntrNoMem, Commutative]>;
> @@ -1627,30 +1603,6 @@ let TargetPrefix = "x86" in {  // All in
>
>  // Integer arithmetic ops.
>  let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
> -  def int_x86_avx2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb256">,
> -              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
> -                         llvm_v32i8_ty], [IntrNoMem, Commutative]>;
> -  def int_x86_avx2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw256">,
> -              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
> -                         llvm_v16i16_ty], [IntrNoMem, Commutative]>;
> -  def int_x86_avx2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb256">,
> -              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
> -                         llvm_v32i8_ty], [IntrNoMem, Commutative]>;
> -  def int_x86_avx2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw256">,
> -              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
> -                         llvm_v16i16_ty], [IntrNoMem, Commutative]>;
> -  def int_x86_avx2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb256">,
> -              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
> -                         llvm_v32i8_ty], [IntrNoMem]>;
> -  def int_x86_avx2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw256">,
> -              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
> -                         llvm_v16i16_ty], [IntrNoMem]>;
> -  def int_x86_avx2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb256">,
> -              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
> -                         llvm_v32i8_ty], [IntrNoMem]>;
> -  def int_x86_avx2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw256">,
> -              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
> -                         llvm_v16i16_ty], [IntrNoMem]>;
>    def int_x86_avx2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw256">,
>                Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
>                           llvm_v16i16_ty], [IntrNoMem, Commutative]>;
> @@ -4695,78 +4647,6 @@ let TargetPrefix = "x86" in {  // All in
>  }
>  // Integer arithmetic ops
>  let TargetPrefix = "x86" in {
> -  def int_x86_avx512_mask_padds_b_128 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
> -                     llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_padds_b_256 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
> -                     llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_padds_b_512 :
> GCCBuiltin<"__builtin_ia32_paddsb512_mask">,
> -          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
> -                     llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_padds_w_128 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
> -                     llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_padds_w_256 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
> -                     llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_padds_w_512 :
> GCCBuiltin<"__builtin_ia32_paddsw512_mask">,
> -          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
> -                     llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_paddus_b_128 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
> -                     llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_paddus_b_256 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
> -                     llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_paddus_b_512 :
> GCCBuiltin<"__builtin_ia32_paddusb512_mask">,
> -          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
> -                     llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_paddus_w_128 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
> -                     llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_paddus_w_256 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
> -                     llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_paddus_w_512 :
> GCCBuiltin<"__builtin_ia32_paddusw512_mask">,
> -          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
> -                     llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubs_b_128 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
> -                     llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubs_b_256 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
> -                     llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubs_b_512 :
> GCCBuiltin<"__builtin_ia32_psubsb512_mask">,
> -          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
> -                     llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubs_w_128 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
> -                     llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubs_w_256 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
> -                     llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubs_w_512 :
> GCCBuiltin<"__builtin_ia32_psubsw512_mask">,
> -          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
> -                     llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubus_b_128 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
> -                     llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubus_b_256 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
> -                     llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubus_b_512 :
> GCCBuiltin<"__builtin_ia32_psubusb512_mask">,
> -          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
> -                     llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubus_w_128 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
> -                     llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubus_w_256 : // FIXME: remove this intrinsic
> -          Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
> -                     llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
> -  def int_x86_avx512_mask_psubus_w_512 :
> GCCBuiltin<"__builtin_ia32_psubusw512_mask">,
> -          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
> -                     llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
>    def int_x86_avx512_pmulhu_w_512 :
> GCCBuiltin<"__builtin_ia32_pmulhuw512">,
>                Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
>                           llvm_v32i16_ty], [IntrNoMem, Commutative]>;
>
> Modified: llvm/trunk/lib/IR/AutoUpgrade.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/IR/AutoUpgrade.cpp?rev=330322&r1=330321&r2=330322&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/IR/AutoUpgrade.cpp (original)
> +++ llvm/trunk/lib/IR/AutoUpgrade.cpp Thu Apr 19 05:13:30 2018
> @@ -84,7 +84,19 @@ static bool ShouldUpgradeX86Intrinsic(Fu
>    // like to use this information to remove upgrade code for some older
>    // intrinsics. It is currently undecided how we will determine that
> future
>    // point.
> -  if (Name=="ssse3.pabs.b.128" || // Added in 6.0
> +  if (Name.startswith("sse2.padds") || // Added in 7.0
> +      Name.startswith("sse2.paddus") || // Added in 7.0
> +      Name.startswith("sse2.psubs") || // Added in 7.0
> +      Name.startswith("sse2.psubus") || // Added in 7.0
> +      Name.startswith("avx2.padds") || // Added in 7.0
> +      Name.startswith("avx2.paddus") || // Added in 7.0
> +      Name.startswith("avx2.psubs") || // Added in 7.0
> +      Name.startswith("avx2.psubus") || // Added in 7.0
> +      Name.startswith("avx512.mask.padds") || // Added in 7.0
> +      Name.startswith("avx512.mask.paddus") || // Added in 7.0
> +      Name.startswith("avx512.mask.psubs") || // Added in 7.0
> +      Name.startswith("avx512.mask.psubus") || // Added in 7.0
> +      Name=="ssse3.pabs.b.128" || // Added in 6.0
>        Name=="ssse3.pabs.w.128" || // Added in 6.0
>        Name=="ssse3.pabs.d.128" || // Added in 6.0
>        Name.startswith("avx512.mask.shuf.i") || // Added in 6.0
> @@ -845,6 +857,77 @@ static Value *UpgradeX86ALIGNIntrinsics(
>    return EmitX86Select(Builder, Mask, Align, Passthru);
>  }
>
> +static Value *UpgradeX86AddSubSatIntrinsics(IRBuilder<> &Builder,
> CallInst &CI,
> +                                            bool IsSigned, bool
> IsAddition) {
> +  // Get elements.
> +  Value *Op0 = CI.getArgOperand(0);
> +  Value *Op1 = CI.getArgOperand(1);
> +
> +  // Extend elements.
> +  Type *ResultType = CI.getType();
> +  unsigned NumElts = ResultType->getVectorNumElements();
> +
> +  Value *Res;
> +  if (!IsAddition && !IsSigned) {
> +    Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_UGT, Op0, Op1);
> +    Value *Select = Builder.CreateSelect(ICmp, Op0, Op1);
> +    Res = Builder.CreateSub(Select, Op1);
> +  } else {
> +    Type *EltType = ResultType->getVectorElementType();
> +    Type *ExtEltType = EltType == Builder.getInt8Ty() ?
> Builder.getInt16Ty()
> +                                                      :
> Builder.getInt32Ty();
> +    Type *ExtVT = VectorType::get(ExtEltType, NumElts);
> +    Op0 = IsSigned ? Builder.CreateSExt(Op0, ExtVT)
> +                   : Builder.CreateZExt(Op0, ExtVT);
> +    Op1 = IsSigned ? Builder.CreateSExt(Op1, ExtVT)
> +                   : Builder.CreateZExt(Op1, ExtVT);
> +
> +    // Perform addition/subtraction.
> +    Res = IsAddition ? Builder.CreateAdd(Op0, Op1)
> +                     : Builder.CreateSub(Op0, Op1);
> +
> +    // Create a vector of maximum values of not extended type
> +    // (if overflow occurs, it will be saturated to that value).
> +    unsigned EltSizeInBits = EltType->getPrimitiveSizeInBits();
> +    APInt MaxInt = IsSigned ? APInt::getSignedMaxValue(EltSizeInBits)
> +                            : APInt::getMaxValue(EltSizeInBits);
> +    Value *MaxVec = ConstantInt::get(ResultType, MaxInt);
> +    // Extend so that it can be compared to result of add/sub.
> +    MaxVec = IsSigned ? Builder.CreateSExt(MaxVec, ExtVT)
> +                      : Builder.CreateZExt(MaxVec, ExtVT);
> +
> +    // Saturate overflow.
> +    ICmpInst::Predicate Pred = IsSigned ? ICmpInst::ICMP_SLE
> +                                        : ICmpInst::ICMP_ULE;
> +    Value *Cmp = Builder.CreateICmp(Pred, Res,
> +                                    MaxVec); // 1 if no overflow.
> +    Res = Builder.CreateSelect(Cmp, Res,
> +                               MaxVec); // If overflowed, copy from max
> vec.
> +
> +    // Saturate underflow.
> +    if (IsSigned) {
> +      APInt MinInt = APInt::getSignedMinValue(EltSizeInBits);
> +      Value *MinVec = ConstantInt::get(ResultType, MinInt);
> +      // Extend so that it can be compared to result of add/sub.
> +      MinVec = Builder.CreateSExt(MinVec, ExtVT);
> +      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_SGT, Res,
> +                                      MinVec); // 1 if no underflow.
> +      Res = Builder.CreateSelect(Cmp, Res,
> +                                 MinVec); // If underflowed, copy from
> min vec.
> +    }
> +
> +    // Truncate to original type.
> +    Res = Builder.CreateTrunc(Res, ResultType);
> +  }
> +
> +  if (CI.getNumArgOperands() == 4) { // For masked intrinsics.
> +    Value *VecSRC = CI.getArgOperand(2);
> +    Value *Mask = CI.getArgOperand(3);
> +    Res = EmitX86Select(Builder, Mask, Res, VecSRC);
> +  }
> +  return Res;
> +}
> +
>  static Value *UpgradeMaskedStore(IRBuilder<> &Builder,
>                                   Value *Ptr, Value *Data, Value *Mask,
>                                   bool Aligned) {
> @@ -1684,6 +1767,26 @@ void llvm::UpgradeIntrinsicCall(CallInst
>                                          ShuffleMask);
>        Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
>                            CI->getArgOperand(1));
> +    } else if (IsX86 && (Name.startswith("sse2.padds") ||
> +                         Name.startswith("avx2.padds") ||
> +                         Name.startswith("avx512.mask.padds"))) {
> +      Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
> +                                          true, true); // Signed add.
> +    } else if (IsX86 && (Name.startswith("sse2.paddus") ||
> +                         Name.startswith("avx2.paddus") ||
> +                         Name.startswith("avx512.mask.paddus"))) {
> +      Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
> +                                          false, true); // Unsigned add.
> +    } else if (IsX86 && (Name.startswith("sse2.psubs") ||
> +                         Name.startswith("avx2.psubs") ||
> +                         Name.startswith("avx512.mask.psubs"))) {
> +      Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
> +                                          true, false); // Signed sub.
> +    } else if (IsX86 && (Name.startswith("sse2.psubus") ||
> +                         Name.startswith("avx2.psubus") ||
> +                         Name.startswith("avx512.mask.psubus"))) {
> +      Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI,
> +                                          false, false); // Unsigned sub.
>      } else if (IsX86 && (Name.startswith("avx2.pbroadcast") ||
>                           Name.startswith("avx2.vbroadcast") ||
>                           Name.startswith("avx512.pbroadcast") ||
> @@ -1694,7 +1797,6 @@ void llvm::UpgradeIntrinsicCall(CallInst
>        Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts);
>        Rep = Builder.CreateShuffleVector(Op,
> UndefValue::get(Op->getType()),
>                                          Constant::getNullValue(MaskTy));
> -
>        if (CI->getNumArgOperands() == 3)
>          Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
>                              CI->getArgOperand(1));
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=330322&r1=330321&r2=330322&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Apr 19 05:13:30 2018
> @@ -35997,6 +35997,91 @@ static SDValue combineVectorSignBitsTrun
>    return SDValue();
>  }
>
> +/// This function detects the addition or subtraction with saturation
> pattern
> +/// between 2 unsigned i8/i16 vectors and replace this operation with the
> +/// efficient X86ISD::ADDUS/X86ISD::ADDS/X86ISD::SUBUS/X86ISD::SUBS
> instruction.
> +static SDValue detectAddSubSatPattern(SDValue In, EVT VT, SelectionDAG
> &DAG,
> +                                      const X86Subtarget &Subtarget,
> +                                      const SDLoc &DL) {
> +  if (!VT.isVector() || !VT.isSimple())
> +    return SDValue();
> +  EVT InVT = In.getValueType();
> +  unsigned NumElems = VT.getVectorNumElements();
> +
> +  EVT ScalarVT = VT.getVectorElementType();
> +  if ((ScalarVT != MVT::i8 && ScalarVT != MVT::i16) ||
> +      InVT.getSizeInBits() % 128 != 0 || !isPowerOf2_32(NumElems))
> +    return SDValue();
> +
> +  // InScalarVT is the intermediate type in AddSubSat pattern
> +  // and it should be greater than the original input type (i8/i16).
> +  EVT InScalarVT = InVT.getVectorElementType();
> +  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
> +    return SDValue();
> +
> +  if (!Subtarget.hasSSE2())
> +    return SDValue();
> +
> +  // Detect the following pattern:
> +  // %2 = zext <16 x i8> %0 to <16 x i16>
> +  // %3 = zext <16 x i8> %1 to <16 x i16>
> +  // %4 = add nuw nsw <16 x i16> %3, %2
> +  // %5 = icmp ult <16 x i16> %4, <16 x i16> (vector of max InScalarVT
> values)
> +  // %6 = select <16 x i1> %5, <16 x i16> (vector of max InScalarVT
> values)
> +  // %7 = trunc <16 x i16> %6 to <16 x i8>
> +
> +  // Detect a Sat Pattern
> +  bool Signed = true;
> +  SDValue Sat = detectSSatPattern(In, VT, false);
> +  if (!Sat) {
> +    Sat = detectUSatPattern(In, VT);
> +    Signed = false;
> +  }
> +  if (!Sat)
> +    return SDValue();
> +  if (Sat.getOpcode() != ISD::ADD && Sat.getOpcode() != ISD::SUB)
> +    return SDValue();
> +
> +  unsigned Opcode = Sat.getOpcode() == ISD::ADD ? Signed ? X86ISD::ADDS
> +                                                         : X86ISD::ADDUS
> +                                                : Signed ? X86ISD::SUBS
> +                                                         : X86ISD::SUBUS;
> +
> +  // Get addition elements.
> +  SDValue LHS = Sat.getOperand(0);
> +  SDValue RHS = Sat.getOperand(1);
> +
> +  // Check if LHS and RHS are results of type promotion or
> +  // one of them is and the other one is constant.
> +  unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND :
> +                                   ISD::ZERO_EXTEND;
> +  unsigned LHSOpcode = LHS.getOpcode();
> +  unsigned RHSOpcode = RHS.getOpcode();
> +
> +  if (LHSOpcode == ExtendOpcode && RHSOpcode == ExtendOpcode) {
> +    LHS = LHS.getOperand(0);
> +    RHS = RHS.getOperand(0);
> +  } else if (LHSOpcode == ExtendOpcode &&
> +             ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
> +    LHS = LHS.getOperand(0);
> +    RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
> +  } else if (RHSOpcode == ExtendOpcode &&
> +           ISD::isBuildVectorOfConstantSDNodes(LHS.getNode())) {
> +    RHS = RHS.getOperand(0);
> +    LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
> +  } else
> +    return SDValue();
> +
> +  // The pattern is detected, emit ADDS/ADDUS/SUBS/SUBUS instruction.
> +  auto AddSubSatBuilder = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
> +                                   ArrayRef<SDValue> Ops) {
> +    EVT VT = Ops[0].getValueType();
> +    return DAG.getNode(Opcode, DL, VT, Ops);
> +  };
> +  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { LHS, RHS },
> +                          AddSubSatBuilder);
> +}
> +
>  static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
>                                 const X86Subtarget &Subtarget) {
>    EVT VT = N->getValueType(0);
> @@ -36011,6 +36096,10 @@ static SDValue combineTruncate(SDNode *N
>    if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
>      return Avg;
>
> +  // Try to detect addition or subtraction with saturation.
> +  if (SDValue AddSubSat = detectAddSubSatPattern(Src, VT, DAG, Subtarget,
> DL))
> +    return AddSubSat;
> +
>    // Try to combine truncation with signed/unsigned saturation.
>    if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
>      return Val;
>
> Modified: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h?rev=330322&r1=330321&r2=330322&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h (original)
> +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h Thu Apr 19 05:13:30 2018
> @@ -402,10 +402,6 @@ static const IntrinsicData  IntrinsicsWi
>    X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
>    X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
>    X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
> -  X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
> -  X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
> -  X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
> -  X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
>    X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
>    X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
>    X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
> @@ -444,10 +440,6 @@ static const IntrinsicData  IntrinsicsWi
>    X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
>    X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
>    X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
> -  X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
> -  X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
> -  X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
> -  X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
>    X86_INTRINSIC_DATA(avx512_cvtsi2sd64,  INTR_TYPE_3OP,
> X86ISD::SCALAR_SINT_TO_FP_RND, 0),
>    X86_INTRINSIC_DATA(avx512_cvtsi2ss32,  INTR_TYPE_3OP,
> X86ISD::SCALAR_SINT_TO_FP_RND, 0),
>    X86_INTRINSIC_DATA(avx512_cvtsi2ss64,  INTR_TYPE_3OP,
> X86ISD::SCALAR_SINT_TO_FP_RND, 0),
> @@ -803,18 +795,6 @@ static const IntrinsicData  IntrinsicsWi
>                       X86ISD::FMULS_RND, 0),
>    X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
>                       X86ISD::FMULS_RND, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK,
> X86ISD::ADDS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK,
> X86ISD::ADDS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK,
> X86ISD::ADDS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK,
> X86ISD::ADDS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK,
> X86ISD::ADDS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK,
> X86ISD::ADDS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK,
> X86ISD::ADDUS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK,
> X86ISD::ADDUS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK,
> X86ISD::ADDUS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK,
> X86ISD::ADDUS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK,
> X86ISD::ADDUS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK,
> X86ISD::ADDUS, 0),
>    X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
>                       X86ISD::VPERMV, 0),
>    X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK,
> @@ -981,18 +961,6 @@ static const IntrinsicData  IntrinsicsWi
>    X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK,
> ISD::ROTR, 0),
>    X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK,
> ISD::ROTR, 0),
>    X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK,
> ISD::ROTR, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK,
> X86ISD::SUBS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK,
> X86ISD::SUBS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK,
> X86ISD::SUBS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK,
> X86ISD::SUBS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK,
> X86ISD::SUBS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK,
> X86ISD::SUBS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK,
> X86ISD::SUBUS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK,
> X86ISD::SUBUS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK,
> X86ISD::SUBUS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK,
> X86ISD::SUBUS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK,
> X86ISD::SUBUS, 0),
> -  X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK,
> X86ISD::SUBUS, 0),
>    X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK,
>                       X86ISD::VPTERNLOG, 0),
>    X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK,
> @@ -1602,10 +1570,6 @@ static const IntrinsicData  IntrinsicsWi
>    X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
>    X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
>    X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
> -  X86_INTRINSIC_DATA(sse2_padds_b,      INTR_TYPE_2OP, X86ISD::ADDS, 0),
> -  X86_INTRINSIC_DATA(sse2_padds_w,      INTR_TYPE_2OP, X86ISD::ADDS, 0),
> -  X86_INTRINSIC_DATA(sse2_paddus_b,     INTR_TYPE_2OP, X86ISD::ADDUS, 0),
> -  X86_INTRINSIC_DATA(sse2_paddus_w,     INTR_TYPE_2OP, X86ISD::ADDUS, 0),
>    X86_INTRINSIC_DATA(sse2_pmadd_wd,     INTR_TYPE_2OP, X86ISD::VPMADDWD,
> 0),
>    X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
>    X86_INTRINSIC_DATA(sse2_pmulh_w,      INTR_TYPE_2OP, ISD::MULHS, 0),
> @@ -1627,10 +1591,6 @@ static const IntrinsicData  IntrinsicsWi
>    X86_INTRINSIC_DATA(sse2_psrli_d,      VSHIFT, X86ISD::VSRLI, 0),
>    X86_INTRINSIC_DATA(sse2_psrli_q,      VSHIFT, X86ISD::VSRLI, 0),
>    X86_INTRINSIC_DATA(sse2_psrli_w,      VSHIFT, X86ISD::VSRLI, 0),
> -  X86_INTRINSIC_DATA(sse2_psubs_b,      INTR_TYPE_2OP, X86ISD::SUBS, 0),
> -  X86_INTRINSIC_DATA(sse2_psubs_w,      INTR_TYPE_2OP, X86ISD::SUBS, 0),
> -  X86_INTRINSIC_DATA(sse2_psubus_b,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
> -  X86_INTRINSIC_DATA(sse2_psubus_w,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
>    X86_INTRINSIC_DATA(sse2_sqrt_pd,      INTR_TYPE_1OP, ISD::FSQRT, 0),
>    X86_INTRINSIC_DATA(sse2_ucomieq_sd,   COMI, X86ISD::UCOMI, ISD::SETEQ),
>    X86_INTRINSIC_DATA(sse2_ucomige_sd,   COMI, X86ISD::UCOMI, ISD::SETGE),
>
> Modified: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll?rev=330322&r1=330321&r2=330322&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll Thu Apr 19
> 05:13:30 2018
> @@ -98,11 +98,17 @@ define <4 x i64> @test_mm256_adds_epi8(<
>  ; CHECK-NEXT:    ret{{[l|q]}}
>    %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
>    %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
> -  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8>
> %arg1)
> -  %bc = bitcast <32 x i8> %res to <4 x i64>
> +  %1 = sext <32 x i8> %arg0 to <32 x i16>
> +  %2 = sext <32 x i8> %arg1 to <32 x i16>
> +  %3 = add nsw <32 x i16> %1, %2
> +  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127>
> +  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127,
> i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127>
> +  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128,
> i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16
> -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128,
> i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16
> -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
> +  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16
> -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128,
> i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16
> -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128,
> i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16
> -128>
> +  %8 = trunc <32 x i16> %7 to <32 x i8>
> +  %bc = bitcast <32 x i8> %8 to <4 x i64>
>    ret <4 x i64> %bc
>  }
> -declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind
> readnone
>
>  define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
>  ; CHECK-LABEL: test_mm256_adds_epi16:
> @@ -111,11 +117,17 @@ define <4 x i64> @test_mm256_adds_epi16(
>  ; CHECK-NEXT:    ret{{[l|q]}}
>    %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
>    %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
> -  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x
> i16> %arg1)
> -  %bc = bitcast <16 x i16> %res to <4 x i64>
> +  %1 = sext <16 x i16> %arg0 to <16 x i32>
> +  %2 = sext <16 x i16> %arg1 to <16 x i32>
> +  %3 = add nsw <16 x i32> %1, %2
> +  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32
> 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767,
> i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
> +  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32
> 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767,
> i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32
> 32767, i32 32767>
> +  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32
> -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32
> -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32
> -32768>
> +  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32
> -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32
> -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32
> -32768, i32 -32768, i32 -32768>
> +  %8 = trunc <16 x i32> %7 to <16 x i16>
> +  %bc = bitcast <16 x i16> %8 to <4 x i64>
>    ret <4 x i64> %bc
>  }
> -declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>)
> nounwind readnone
>
>  define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
>  ; CHECK-LABEL: test_mm256_adds_epu8:
> @@ -124,11 +136,15 @@ define <4 x i64> @test_mm256_adds_epu8(<
>  ; CHECK-NEXT:    ret{{[l|q]}}
>    %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
>    %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
> -  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x
> i8> %arg1)
> -  %bc = bitcast <32 x i8> %res to <4 x i64>
> +  %1 = zext <32 x i8> %arg0 to <32 x i16>
> +  %2 = zext <32 x i8> %arg1 to <32 x i16>
> +  %3 = add nsw <32 x i16> %1, %2
> +  %4 = icmp ult <32 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16
> 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16
> 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16
> 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16
> 255, i16 255, i16 255, i16 255>
> +  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 255, i16 255,
> i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16
> 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16
> 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16
> 255, i16 255, i16 255, i16 255, i16 255, i16 255>
> +  %6 = trunc <32 x i16> %5 to <32 x i8>
> +  %bc = bitcast <32 x i8> %6 to <4 x i64>
>    ret <4 x i64> %bc
>  }
> -declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind
> readnone
>
>  define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
>  ; CHECK-LABEL: test_mm256_adds_epu16:
> @@ -137,11 +153,15 @@ define <4 x i64> @test_mm256_adds_epu16(
>  ; CHECK-NEXT:    ret{{[l|q]}}
>    %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
>    %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
> -  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x
> i16> %arg1)
> -  %bc = bitcast <16 x i16> %res to <4 x i64>
> +  %1 = zext <16 x i16> %arg0 to <16 x i32>
> +  %2 = zext <16 x i16> %arg1 to <16 x i32>
> +  %3 = add nsw <16 x i32> %1, %2
> +  %4 = icmp ult <16 x i32> %3, <i32 65535, i32 65535, i32 65535, i32
> 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535,
> i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
> +  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 65535, i32
> 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535,
> i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32
> 65535, i32 65535>
> +  %6 = trunc <16 x i32> %5 to <16 x i16>
> +  %bc = bitcast <16 x i16> %6 to <4 x i64>
>    ret <4 x i64> %bc
>  }
> -declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>)
> nounwind readnone
>
>  define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
>  ; CHECK-LABEL: test_mm256_alignr_epi8:
> @@ -2529,11 +2549,17 @@ define <4 x i64> @test_mm256_subs_epi8(<
>  ; CHECK-NEXT:    ret{{[l|q]}}
>    %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
>    %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
> -  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8>
> %arg1)
> -  %bc = bitcast <32 x i8> %res to <4 x i64>
> +  %1 = sext <32 x i8> %arg0 to <32 x i16>
> +  %2 = sext <32 x i8> %arg1 to <32 x i16>
> +  %3 = sub nsw <32 x i16> %1, %2
> +  %4 = icmp slt <32 x i16> %3, <i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127>
> +  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> <i16 127, i16 127,
> i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16
> 127, i16 127, i16 127, i16 127, i16 127, i16 127>
> +  %6 = icmp sgt <32 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128,
> i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16
> -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128,
> i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16
> -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
> +  %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> <i16 -128, i16
> -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128,
> i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16
> -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128,
> i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16
> -128>
> +  %8 = trunc <32 x i16> %7 to <32 x i8>
> +  %bc = bitcast <32 x i8> %8 to <4 x i64>
>    ret <4 x i64> %bc
>  }
> -declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind
> readnone
>
>  define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
>  ; CHECK-LABEL: test_mm256_subs_epi16:
> @@ -2542,37 +2568,47 @@ define <4 x i64> @test_mm256_subs_epi16(
>  ; CHECK-NEXT:    ret{{[l|q]}}
>    %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
>    %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
> -  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x
> i16> %arg1)
> -  %bc = bitcast <16 x i16> %res to <4 x i64>
> +  %1 = sext <16 x i16> %arg0 to <16 x i32>
> +  %2 = sext <16 x i16> %arg1 to <16 x i32>
> +  %3 = sub nsw <16 x i32> %1, %2
> +  %4 = icmp slt <16 x i32> %3, <i32 32767, i32 32767, i32 32767, i32
> 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767,
> i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
> +  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> <i32 32767, i32
> 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767,
> i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32
> 32767, i32 32767>
> +  %6 = icmp sgt <16 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32
> -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32
> -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32
> -32768>
> +  %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 -32768, i32
> -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32
> -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32
> -32768, i32 -32768, i32 -32768>
> +  %8 = trunc <16 x i32> %7 to <16 x i16>
> +  %bc = bitcast <16 x i16> %8 to <4 x i64>
>    ret <4 x i64> %bc
>  }
> -declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>)
> nounwind readnone
>
>  define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
>  ; CHECK-LABEL: test_mm256_subs_epu8:
>  ; CHECK:       # %bb.0:
> -; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
> +; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
> +; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
>  ; CHECK-NEXT:    ret{{[l|q]}}
>    %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
>    %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
> -  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x
> i8> %arg1)
> -  %bc = bitcast <32 x i8> %res to <4 x i64>
> +  %cmp = icmp ugt <32 x i8> %arg0, %arg1
> +  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
> +  %sub = sub <32 x i8> %sel, %arg1
> +  %bc = bitcast <32 x i8> %sub to <4 x i64>
>    ret <4 x i64> %bc
>  }
> -declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind
> readnone
>
>  define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
>  ; CHECK-LABEL: test_mm256_subs_epu16:
>  ; CHECK:       # %bb.0:
> -; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
> +; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
> +; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
>  ; CHECK-NEXT:    ret{{[l|q]}}
>    %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
>    %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
> -  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x
> i16> %arg1)
> -  %bc = bitcast <16 x i16> %res to <4 x i64>
> +  %cmp = icmp ugt <16 x i16> %arg0, %arg1
> +  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
> +  %sub = sub <16 x i16> %sel, %arg1
> +  %bc = bitcast <16 x i16> %sub to <4 x i64>
>    ret <4 x i64> %bc
>  }
> -declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>)
> nounwind readnone
>
>  define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1)
> nounwind {
>  ; CHECK-LABEL: test_mm256_unpackhi_epi8:
>
> Modified: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll?rev=330322&r1=330321&r2=330322&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll Thu Apr 19
> 05:13:30 2018
> @@ -848,6 +848,133 @@ define <4 x i64> @test_x86_avx2_vperm2i1
>  declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8)
> nounwind readonly
>
>
> +define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
> +; AVX2-LABEL: test_x86_avx2_padds_b:
> +; AVX2:       ## %bb.0:
> +; AVX2-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT:    ret{{[l|q]}}
> +;
> +; AVX512VL-LABEL: test_x86_avx2_padds_b:
> +; AVX512VL:       ## %bb.0:
> +; AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xec,0xc1]
> +; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
> +  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8>
> %a1) ; <<32 x i8>> [#uses=1]
> +  ret <32 x i8> %res
> +}
> +declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind
> readnone
> +
> +
> +define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
> +; AVX2-LABEL: test_x86_avx2_padds_w:
> +; AVX2:       ## %bb.0:
> +; AVX2-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT:    ret{{[l|q]}}
> +;
> +; AVX512VL-LABEL: test_x86_avx2_padds_w:
> +; AVX512VL:       ## %bb.0:
> +; AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xed,0xc1]
> +; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
> +  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x
> i16> %a1) ; <<16 x i16>> [#uses=1]
> +  ret <16 x i16> %res
> +}
> +declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>)
> nounwind readnone
> +
> +
> +define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
> +; AVX2-LABEL: test_x86_avx2_paddus_b:
> +; AVX2:       ## %bb.0:
> +; AVX2-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT:    ret{{[l|q]}}
> +;
> +; AVX512VL-LABEL: test_x86_avx2_paddus_b:
> +; AVX512VL:       ## %bb.0:
> +; AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xdc,0xc1]
> +; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
> +  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8>
> %a1) ; <<32 x i8>> [#uses=1]
> +  ret <32 x i8> %res
> +}
> +declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind
> readnone
> +
> +
> +define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1)
> {
> +; AVX2-LABEL: test_x86_avx2_paddus_w:
> +; AVX2:       ## %bb.0:
> +; AVX2-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT:    ret{{[l|q]}}
> +;
> +; AVX512VL-LABEL: test_x86_avx2_paddus_w:
> +; AVX512VL:       ## %bb.0:
> +; AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xdd,0xc1]
> +; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
> +  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x
> i16> %a1) ; <<16 x i16>> [#uses=1]
> +  ret <16 x i16> %res
> +}
> +declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>)
> nounwind readnone
> +
> +
> +define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
> +; AVX2-LABEL: test_x86_avx2_psubs_b:
> +; AVX2:       ## %bb.0:
> +; AVX2-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT:    ret{{[l|q]}}
> +;
> +; AVX512VL-LABEL: test_x86_avx2_psubs_b:
> +; AVX512VL:       ## %bb.0:
> +; AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xe8,0xc1]
> +; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
> +  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8>
> %a1) ; <<32 x i8>> [#uses=1]
> +  ret <32 x i8> %res
> +}
> +declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind
> readnone
> +
> +
> +define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
> +; AVX2-LABEL: test_x86_avx2_psubs_w:
> +; AVX2:       ## %bb.0:
> +; AVX2-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT:    ret{{[l|q]}}
> +;
> +; AVX512VL-LABEL: test_x86_avx2_psubs_w:
> +; AVX512VL:       ## %bb.0:
> +; AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xe9,0xc1]
> +; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
> +  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x
> i16> %a1) ; <<16 x i16>> [#uses=1]
> +  ret <16 x i16> %res
> +}
> +declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>)
> nounwind readnone
> +
> +
> +define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
> +; AVX2-LABEL: test_x86_avx2_psubus_b:
> +; AVX2:       ## %bb.0:
> +; AVX2-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT:    ret{{[l|q]}}
> +;
> +; AVX512VL-LABEL: test_x86_avx2_psubus_b:
> +; AVX512VL:       ## %bb.0:
> +; AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xd8,0xc1]
> +; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
> +  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8>
> %a1) ; <<32 x i8>> [#uses=1]
> +  ret <32 x i8> %res
> +}
> +declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind
> readnone
> +
> +
> +define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1)
> {
> +; AVX2-LABEL: test_x86_avx2_psubus_w:
> +; AVX2:       ## %bb.0:
> +; AVX2-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
> +; AVX2-NEXT:    ret{{[l|q]}}
> +;
> +; AVX512VL-LABEL: test_x86_avx2_psubus_w:
> +; AVX512VL:       ## %bb.0:
> +; AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xd9,0xc1]
> +; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
> +  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x
> i16> %a1) ; <<16 x i16>> [#uses=1]
> +  ret <16 x i16> %res
> +}
> +declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>)
> nounwind readnone
> +
>  define <4 x i64> @test_x86_avx2_pmulu_dq(<8 x i32> %a0, <8 x i32> %a1) {
>  ; X86-LABEL: test_x86_avx2_pmulu_dq:
>  ; X86:       ## %bb.0:
>
> Modified: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll?rev=330322&r1=330321&r2=330322&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll Thu Apr 19 05:13:30
> 2018
> @@ -181,110 +181,6 @@ define <32 x i8> @test_x86_avx2_packuswb
>  }
>
>
> -define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
> -; X86-AVX-LABEL: test_x86_avx2_padds_b:
> -; X86-AVX:       ## %bb.0:
> -; X86-AVX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xec,0xc1]
> -; X86-AVX-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X86-AVX512VL-LABEL: test_x86_avx2_padds_b:
> -; X86-AVX512VL:       ## %bb.0:
> -; X86-AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xec,0xc1]
> -; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X64-AVX-LABEL: test_x86_avx2_padds_b:
> -; X64-AVX:       ## %bb.0:
> -; X64-AVX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xec,0xc1]
> -; X64-AVX-NEXT:    retq ## encoding: [0xc3]
> -;
> -; X64-AVX512VL-LABEL: test_x86_avx2_padds_b:
> -; X64-AVX512VL:       ## %bb.0:
> -; X64-AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xec,0xc1]
> -; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
> -  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8>
> %a1) ; <<32 x i8>> [#uses=1]
> -  ret <32 x i8> %res
> -}
> -declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind
> readnone
> -
> -
> -define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
> -; X86-AVX-LABEL: test_x86_avx2_padds_w:
> -; X86-AVX:       ## %bb.0:
> -; X86-AVX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xed,0xc1]
> -; X86-AVX-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X86-AVX512VL-LABEL: test_x86_avx2_padds_w:
> -; X86-AVX512VL:       ## %bb.0:
> -; X86-AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xed,0xc1]
> -; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X64-AVX-LABEL: test_x86_avx2_padds_w:
> -; X64-AVX:       ## %bb.0:
> -; X64-AVX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xed,0xc1]
> -; X64-AVX-NEXT:    retq ## encoding: [0xc3]
> -;
> -; X64-AVX512VL-LABEL: test_x86_avx2_padds_w:
> -; X64-AVX512VL:       ## %bb.0:
> -; X64-AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xed,0xc1]
> -; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
> -  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x
> i16> %a1) ; <<16 x i16>> [#uses=1]
> -  ret <16 x i16> %res
> -}
> -declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>)
> nounwind readnone
> -
> -
> -define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
> -; X86-AVX-LABEL: test_x86_avx2_paddus_b:
> -; X86-AVX:       ## %bb.0:
> -; X86-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xdc,0xc1]
> -; X86-AVX-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b:
> -; X86-AVX512VL:       ## %bb.0:
> -; X86-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xdc,0xc1]
> -; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X64-AVX-LABEL: test_x86_avx2_paddus_b:
> -; X64-AVX:       ## %bb.0:
> -; X64-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xdc,0xc1]
> -; X64-AVX-NEXT:    retq ## encoding: [0xc3]
> -;
> -; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b:
> -; X64-AVX512VL:       ## %bb.0:
> -; X64-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xdc,0xc1]
> -; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
> -  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8>
> %a1) ; <<32 x i8>> [#uses=1]
> -  ret <32 x i8> %res
> -}
> -declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind
> readnone
> -
> -
> -define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1)
> {
> -; X86-AVX-LABEL: test_x86_avx2_paddus_w:
> -; X86-AVX:       ## %bb.0:
> -; X86-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xdd,0xc1]
> -; X86-AVX-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w:
> -; X86-AVX512VL:       ## %bb.0:
> -; X86-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xdd,0xc1]
> -; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X64-AVX-LABEL: test_x86_avx2_paddus_w:
> -; X64-AVX:       ## %bb.0:
> -; X64-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xdd,0xc1]
> -; X64-AVX-NEXT:    retq ## encoding: [0xc3]
> -;
> -; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w:
> -; X64-AVX512VL:       ## %bb.0:
> -; X64-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xdd,0xc1]
> -; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
> -  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x
> i16> %a1) ; <<16 x i16>> [#uses=1]
> -  ret <16 x i16> %res
> -}
> -declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>)
> nounwind readnone
> -
> -
>  define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
>  ; X86-AVX-LABEL: test_x86_avx2_pmadd_wd:
>  ; X86-AVX:       ## %bb.0:
> @@ -927,109 +823,6 @@ define <16 x i16> @test_x86_avx2_psrli_w
>  declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind
> readnone
>
>
> -define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
> -; X86-AVX-LABEL: test_x86_avx2_psubs_b:
> -; X86-AVX:       ## %bb.0:
> -; X86-AVX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xe8,0xc1]
> -; X86-AVX-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X86-AVX512VL-LABEL: test_x86_avx2_psubs_b:
> -; X86-AVX512VL:       ## %bb.0:
> -; X86-AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xe8,0xc1]
> -; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X64-AVX-LABEL: test_x86_avx2_psubs_b:
> -; X64-AVX:       ## %bb.0:
> -; X64-AVX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xe8,0xc1]
> -; X64-AVX-NEXT:    retq ## encoding: [0xc3]
> -;
> -; X64-AVX512VL-LABEL: test_x86_avx2_psubs_b:
> -; X64-AVX512VL:       ## %bb.0:
> -; X64-AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xe8,0xc1]
> -; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
> -  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8>
> %a1) ; <<32 x i8>> [#uses=1]
> -  ret <32 x i8> %res
> -}
> -declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind
> readnone
> -
> -
> -define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
> -; X86-AVX-LABEL: test_x86_avx2_psubs_w:
> -; X86-AVX:       ## %bb.0:
> -; X86-AVX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xe9,0xc1]
> -; X86-AVX-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w:
> -; X86-AVX512VL:       ## %bb.0:
> -; X86-AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xe9,0xc1]
> -; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X64-AVX-LABEL: test_x86_avx2_psubs_w:
> -; X64-AVX:       ## %bb.0:
> -; X64-AVX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xe9,0xc1]
> -; X64-AVX-NEXT:    retq ## encoding: [0xc3]
> -;
> -; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w:
> -; X64-AVX512VL:       ## %bb.0:
> -; X64-AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xe9,0xc1]
> -; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
> -  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x
> i16> %a1) ; <<16 x i16>> [#uses=1]
> -  ret <16 x i16> %res
> -}
> -declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>)
> nounwind readnone
> -
> -
> -define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
> -; X86-AVX-LABEL: test_x86_avx2_psubus_b:
> -; X86-AVX:       ## %bb.0:
> -; X86-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xd8,0xc1]
> -; X86-AVX-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b:
> -; X86-AVX512VL:       ## %bb.0:
> -; X86-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xd8,0xc1]
> -; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X64-AVX-LABEL: test_x86_avx2_psubus_b:
> -; X64-AVX:       ## %bb.0:
> -; X64-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xd8,0xc1]
> -; X64-AVX-NEXT:    retq ## encoding: [0xc3]
> -;
> -; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b:
> -; X64-AVX512VL:       ## %bb.0:
> -; X64-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xd8,0xc1]
> -; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
> -  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8>
> %a1) ; <<32 x i8>> [#uses=1]
> -  ret <32 x i8> %res
> -}
> -declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind
> readnone
> -
> -
> -define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1)
> {
> -; X86-AVX-LABEL: test_x86_avx2_psubus_w:
> -; X86-AVX:       ## %bb.0:
> -; X86-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xd9,0xc1]
> -; X86-AVX-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w:
> -; X86-AVX512VL:       ## %bb.0:
> -; X86-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xd9,0xc1]
> -; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
> -;
> -; X64-AVX-LABEL: test_x86_avx2_psubus_w:
> -; X64-AVX:       ## %bb.0:
> -; X64-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding:
> [0xc5,0xfd,0xd9,0xc1]
> -; X64-AVX-NEXT:    retq ## encoding: [0xc3]
> -;
> -; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w:
> -; X64-AVX512VL:       ## %bb.0:
> -; X64-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc5,0xfd,0xd9,0xc1]
> -; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
> -  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x
> i16> %a1) ; <<16 x i16>> [#uses=1]
> -  ret <16 x i16> %res
> -}
> -declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>)
> nounwind readnone
> -
>  define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) {
>  ; X86-LABEL: test_x86_avx2_phadd_d:
>  ; X86:       ## %bb.0:
> @@ -1330,29 +1123,29 @@ define <16 x i16> @test_x86_avx2_packusd
>  ; X86-AVX:       ## %bb.0:
>  ; X86-AVX-NEXT:    vmovaps {{.*#+}} ymm0 =
> [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
>  ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
> -; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI54_0, kind:
> FK_Data_4
> -; X86-AVX-NEXT:    retl ## encoding: [0xc3]
> +; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI46_0, kind:
> FK_Data_4
> +; X86-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>  ;
>  ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
>  ; X86-AVX512VL:       ## %bb.0:
> -; X86-AVX512VL-NEXT:    vmovaps LCPI54_0, %ymm0 ## EVEX TO VEX
> Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
> +; X86-AVX512VL-NEXT:    vmovaps LCPI46_0, %ymm0 ## EVEX TO VEX
> Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
>  ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
> -; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI54_0, kind:
> FK_Data_4
> -; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
> +; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI46_0, kind:
> FK_Data_4
> +; X86-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>  ;
>  ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold:
>  ; X64-AVX:       ## %bb.0:
>  ; X64-AVX-NEXT:    vmovaps {{.*#+}} ymm0 =
> [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
>  ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
> -; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI54_0-4, kind:
> reloc_riprel_4byte
> -; X64-AVX-NEXT:    retq ## encoding: [0xc3]
> +; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI46_0-4, kind:
> reloc_riprel_4byte
> +; X64-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>  ;
>  ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
>  ; X64-AVX512VL:       ## %bb.0:
>  ; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX
> Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
>  ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
> -; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI54_0-4, kind:
> reloc_riprel_4byte
> -; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
> +; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI46_0-4, kind:
> reloc_riprel_4byte
> +; X64-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>    %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>
> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32
> -32767, i32 -65535, i32 0, i32 -256>)
>    ret <16 x i16> %res
>  }
> @@ -2071,37 +1864,37 @@ define <4 x i32> @test_x86_avx2_psrav_d_
>  ; X86-AVX:       ## %bb.0:
>  ; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
>  ; X86-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
> -; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind:
> FK_Data_4
> -; X86-AVX-NEXT:    vpsravd LCPI86_1, %xmm0, %xmm0 ## encoding:
> [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
> -; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind:
> FK_Data_4
> -; X86-AVX-NEXT:    retl ## encoding: [0xc3]
> +; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI78_0, kind:
> FK_Data_4
> +; X86-AVX-NEXT:    vpsravd LCPI78_1, %xmm0, %xmm0 ## encoding:
> [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
> +; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI78_1, kind:
> FK_Data_4
> +; X86-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>  ;
>  ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
>  ; X86-AVX512VL:       ## %bb.0:
> -; X86-AVX512VL-NEXT:    vmovdqa LCPI86_0, %xmm0 ## EVEX TO VEX
> Compression xmm0 = [2,9,4294967284,23]
> +; X86-AVX512VL-NEXT:    vmovdqa LCPI78_0, %xmm0 ## EVEX TO VEX
> Compression xmm0 = [2,9,4294967284,23]
>  ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
> -; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind:
> FK_Data_4
> -; X86-AVX512VL-NEXT:    vpsravd LCPI86_1, %xmm0, %xmm0 ## EVEX TO VEX
> Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
> -; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind:
> FK_Data_4
> -; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
> +; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI78_0, kind:
> FK_Data_4
> +; X86-AVX512VL-NEXT:    vpsravd LCPI78_1, %xmm0, %xmm0 ## EVEX TO VEX
> Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
> +; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI78_1, kind:
> FK_Data_4
> +; X86-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>  ;
>  ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const:
>  ; X64-AVX:       ## %bb.0:
>  ; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
>  ; X64-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
> -; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI86_0-4, kind:
> reloc_riprel_4byte
> +; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI78_0-4, kind:
> reloc_riprel_4byte
>  ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding:
> [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
> -; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI86_1-4, kind:
> reloc_riprel_4byte
> -; X64-AVX-NEXT:    retq ## encoding: [0xc3]
> +; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI78_1-4, kind:
> reloc_riprel_4byte
> +; X64-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>  ;
>  ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
>  ; X64-AVX512VL:       ## %bb.0:
>  ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX
> Compression xmm0 = [2,9,4294967284,23]
>  ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
> -; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI86_0-4, kind:
> reloc_riprel_4byte
> +; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI78_0-4, kind:
> reloc_riprel_4byte
>  ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX
> Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
> -; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI86_1-4, kind:
> reloc_riprel_4byte
> -; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
> +; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI78_1-4, kind:
> reloc_riprel_4byte
> +; X64-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>    %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9,
> i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
>    ret <4 x i32> %res
>  }
> @@ -2136,37 +1929,37 @@ define <8 x i32> @test_x86_avx2_psrav_d_
>  ; X86-AVX:       ## %bb.0:
>  ; X86-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 =
> [2,9,4294967284,23,4294967270,37,4294967256,51]
>  ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
> -; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI88_0, kind:
> FK_Data_4
> -; X86-AVX-NEXT:    vpsravd LCPI88_1, %ymm0, %ymm0 ## encoding:
> [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
> -; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI88_1, kind:
> FK_Data_4
> -; X86-AVX-NEXT:    retl ## encoding: [0xc3]
> +; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI80_0, kind:
> FK_Data_4
> +; X86-AVX-NEXT:    vpsravd LCPI80_1, %ymm0, %ymm0 ## encoding:
> [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
> +; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI80_1, kind:
> FK_Data_4
> +; X86-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>  ;
>  ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
>  ; X86-AVX512VL:       ## %bb.0:
> -; X86-AVX512VL-NEXT:    vmovdqa LCPI88_0, %ymm0 ## EVEX TO VEX
> Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
> +; X86-AVX512VL-NEXT:    vmovdqa LCPI80_0, %ymm0 ## EVEX TO VEX
> Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
>  ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
> -; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI88_0, kind:
> FK_Data_4
> -; X86-AVX512VL-NEXT:    vpsravd LCPI88_1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
> -; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI88_1, kind:
> FK_Data_4
> -; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
> +; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI80_0, kind:
> FK_Data_4
> +; X86-AVX512VL-NEXT:    vpsravd LCPI80_1, %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
> +; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI80_1, kind:
> FK_Data_4
> +; X86-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>  ;
>  ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const:
>  ; X64-AVX:       ## %bb.0:
>  ; X64-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 =
> [2,9,4294967284,23,4294967270,37,4294967256,51]
>  ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
> -; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI88_0-4, kind:
> reloc_riprel_4byte
> +; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI80_0-4, kind:
> reloc_riprel_4byte
>  ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding:
> [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
> -; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI88_1-4, kind:
> reloc_riprel_4byte
> -; X64-AVX-NEXT:    retq ## encoding: [0xc3]
> +; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI80_1-4, kind:
> reloc_riprel_4byte
> +; X64-AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>  ;
>  ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
>  ; X64-AVX512VL:       ## %bb.0:
>  ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX
> Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
>  ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
> -; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI88_0-4, kind:
> reloc_riprel_4byte
> +; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI80_0-4, kind:
> reloc_riprel_4byte
>  ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX
> Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
> -; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI88_1-4, kind:
> reloc_riprel_4byte
> -; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
> +; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI80_1-4, kind:
> reloc_riprel_4byte
> +; X64-AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
>    %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32
> 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1,
> i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
>    ret <8 x i32> %res
>  }
>
> Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll?rev=330322&r1=330321&r2=330322&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll Thu Apr 19
> 05:13:30 2018
> @@ -2694,6 +2694,422 @@ define <32 x i16>@test_int_x86_avx512_ma
>    ret <32 x i16> %res2
>  }
>
> +define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16>
> %b) {
> +; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
> +; AVX512BW:       ## %bb.0:
> +; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
> +; AVX512BW-NEXT:    retq
> +;
> +; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512:
> +; AVX512F-32:       # %bb.0:
> +; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
> +; AVX512F-32-NEXT:    retl
> +  %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a,
> <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
> +  ret <32 x i16> %res
> +}
> +
> +define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16>
> %b, <32 x i16> %passThru, i32 %mask) {
> +; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512:
> +; AVX512BW:       ## %bb.0:
> +; AVX512BW-NEXT:    kmovd %edi, %k1
> +; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
> +; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
> +; AVX512BW-NEXT:    retq
> +;
> +; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512:
> +; AVX512F-32:       # %bb.0:
> +; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
> +; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
> +; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
> +; AVX512F-32-NEXT:    retl
> +  %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a,
> <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
> +  ret <32 x i16> %res
> +}
> +
> +define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x
> i16> %b, i32 %mask) {
> +; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512:
> +; AVX512BW:       ## %bb.0:
> +; AVX512BW-NEXT:    kmovd %edi, %k1
> +; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
> +; AVX512BW-NEXT:    retq
> +;
> +; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512:
> +; AVX512F-32:       # %bb.0:
> +; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
> +; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
> +; AVX512F-32-NEXT:    retl
> +  %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a,
> <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
> +  ret <32 x i16> %res
> +}
> +
> +define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>*
> %ptr_b) {
> +; AVX512BW-LABEL: test_mask_adds_epi16_rm_512:
> +; AVX512BW:       ## %bb.0:
> +; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0
> +; AVX512BW-NEXT:    retq
> +;
> +; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512:
> +; AVX512F-32:       # %bb.0:
> +; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
> +; AVX512F-32-NEXT:
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180426/0766d7d3/attachment.html>


More information about the llvm-commits mailing list