[llvm] r302582 - [AArch64] Consider widening instructions in cost calculations

Tue May 9 13:18:12 PDT 2017

Author: mssimpso
Date: Tue May  9 15:18:12 2017
New Revision: 302582

URL: http://llvm.org/viewvc/llvm-project?rev=302582&view=rev
Log:
[AArch64] Consider widening instructions in cost calculations

The AArch64 instruction set has a few "widening" instructions (e.g., uaddl,
saddl, uaddw, etc.) that take one or more doubleword operands and produce
quadword results. The operands are automatically sign- or zero-extended as
appropriate. However, in LLVM IR, these extends are explicit. This patch
updates TTI to consider these widening instructions as single operations whose
cost is attached to the arithmetic instruction. It marks extends that are part
of a widening operation "free" and applies a sub-target specified overhead
(zero by default) to the arithmetic instructions.

Differential Revision: https://reviews.llvm.org/D32706

Added:
    llvm/trunk/test/Analysis/CostModel/AArch64/free-widening-casts.ll
Modified:
    llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
    llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp

Modified: llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h?rev=302582&r1=302581&r2=302582&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h Tue May  9 15:18:12 2017
@@ -106,6 +106,7 @@ protected:
   unsigned PrefFunctionAlignment = 0;
   unsigned PrefLoopAlignment = 0;
   unsigned MaxJumpTableSize = 0;
+  unsigned WideningBaseCost = 0;
 
   // ReserveX18 - X18 is not available as a general purpose register.
   bool ReserveX18;
@@ -228,6 +229,8 @@ public:
 
   unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
 
+  unsigned getWideningBaseCost() const { return WideningBaseCost; }
+
   /// CPU has TBI (top byte of addresses is ignored during HW address
   /// translation) and OS enables it.
   bool supportsAddressTopByteIgnored() const;

Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp?rev=302582&r1=302581&r2=302582&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp Tue May  9 15:18:12 2017
@@ -176,11 +176,95 @@ AArch64TTIImpl::getPopcntSupport(unsigne
   return TTI::PSK_Software;
 }
 
+bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
+                                           ArrayRef<const Value *> Args) {
+
+  // A helper that returns a vector type from the given type. The number of
+  // elements in type Ty determine the vector width.
+  auto toVectorTy = [&](Type *ArgTy) {
+    return VectorType::get(ArgTy->getScalarType(),
+                           DstTy->getVectorNumElements());
+  };
+
+  // Exit early if DstTy is not a vector type whose elements are at least
+  // 16-bits wide.
+  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
+    return false;
+
+  // Determine if the operation has a widening variant. We consider both the
+  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
+  // instructions.
+  //
+  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
+  //       verify that their extending operands are eliminated during code
+  //       generation.
+  switch (Opcode) {
+  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
+  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+    break;
+  default:
+    return false;
+  }
+
+  // To be a widening instruction (either the "wide" or "long" versions), the
+  // second operand must be a sign- or zero extend having a single user. We
+  // only consider extends having a single user because they may otherwise not
+  // be eliminated.
+  if (Args.size() != 2 ||
+      (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
+      !Args[1]->hasOneUse())
+    return false;
+  auto *Extend = cast<CastInst>(Args[1]);
+
+  // Legalize the destination type and ensure it can be used in a widening
+  // operation.
+  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
+  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
+  if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
+    return false;
+
+  // Legalize the source type and ensure it can be used in a widening
+  // operation.
+  Type *SrcTy = toVectorTy(Extend->getSrcTy());
+  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
+  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
+  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
+    return false;
+
+  // Get the total number of vector elements in the legalized types.
+  unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
+  unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
+
+  // Return true if the legalized types have the same number of vector elements
+  // and the destination element type size is twice that of the source type.
+  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
+}
+
 int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                      const Instruction *I) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  // If the cast is observable, and it is used by a widening instruction (e.g.,
+  // uaddl, saddw, etc.), it may be free.
+  if (I && I->hasOneUse()) {
+    auto *SingleUser = cast<Instruction>(*I->user_begin());
+    SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
+    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
+      // If the cast is the second operand, it is free. We will generate either
+      // a "wide" or "long" version of the widening instruction.
+      if (I == SingleUser->getOperand(1))
+        return 0;
+      // If the cast is not the second operand, it will be free if it looks the
+      // same as the second operand. In this case, we will generate a "long"
+      // version of the widening instruction.
+      if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
+        if (I->getOpcode() == Cast->getOpcode() &&
+            cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
+          return 0;
+    }
+  }
+
   EVT SrcTy = TLI->getValueType(DL, Src);
   EVT DstTy = TLI->getValueType(DL, Dst);
 
@@ -379,6 +463,16 @@ int AArch64TTIImpl::getArithmeticInstrCo
   // Legalize the type.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 
+  // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
+  // add in the widening overhead specified by the sub-target. Since the
+  // extends feeding widening instructions are performed automatically, they
+  // aren't present in the generated code and have a zero cost. By adding a
+  // widening overhead here, we attach the total cost of the combined operation
+  // to the widening instruction.
+  int Cost = 0;
+  if (isWideningInstruction(Ty, Opcode, Args))
+    Cost += ST->getWideningBaseCost();
+
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 
   if (ISD == ISD::SDIV &&
@@ -388,9 +482,9 @@ int AArch64TTIImpl::getArithmeticInstrCo
     // normally expanded to the sequence ADD + CMP + SELECT + SRA.
     // The OperandValue properties many not be same as that of previous
     // operation; conservatively assume OP_None.
-    int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
-                                      TargetTransformInfo::OP_None,
-                                      TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
     Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
@@ -405,8 +499,8 @@ int AArch64TTIImpl::getArithmeticInstrCo
 
   switch (ISD) {
   default:
-    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                         Opd1PropInfo, Opd2PropInfo);
+    return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                                Opd1PropInfo, Opd2PropInfo);
   case ISD::ADD:
   case ISD::MUL:
   case ISD::XOR:
@@ -414,7 +508,7 @@ int AArch64TTIImpl::getArithmeticInstrCo
   case ISD::AND:
     // These nodes are marked as 'custom' for combining purposes only.
     // We know that they are legal. See LowerAdd in ISelLowering.
-    return 1 * LT.first;
+    return (Cost + 1) * LT.first;
   }
 }
 

Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h?rev=302582&r1=302581&r2=302582&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h Tue May  9 15:18:12 2017
@@ -43,6 +43,9 @@ class AArch64TTIImpl : public BasicTTIIm
     VECTOR_LDST_FOUR_ELEMENTS
   };
 
+  bool isWideningInstruction(Type *Ty, unsigned Opcode,
+                             ArrayRef<const Value *> Args);
+
 public:
   explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=302582&r1=302581&r2=302582&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Tue May  9 15:18:12 2017
@@ -1819,11 +1819,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
           CInt->getValue().isPowerOf2())
         Op2VP = TargetTransformInfo::OP_PowerOf2;
 
-      int ScalarCost = VecTy->getNumElements() *
-                       TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
-                                                   Op2VK, Op1VP, Op2VP);
+      SmallVector<const Value *, 4> Operands(VL0->operand_values());
+      int ScalarCost =
+          VecTy->getNumElements() *
+          TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+                                      Op2VP, Operands);
       int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
-                                                Op1VP, Op2VP);
+                                                Op1VP, Op2VP, Operands);
       return VecCost - ScalarCost;
     }
     case Instruction::GetElementPtr: {

Added: llvm/trunk/test/Analysis/CostModel/AArch64/free-widening-casts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AArch64/free-widening-casts.ll?rev=302582&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AArch64/free-widening-casts.ll (added)
+++ llvm/trunk/test/Analysis/CostModel/AArch64/free-widening-casts.ll Tue May  9 15:18:12 2017
@@ -0,0 +1,622 @@
+; RUN: opt < %s -mtriple=aarch64--linux-gnu -cost-model -analyze | FileCheck %s --check-prefix=COST
+; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE
+
+; COST-LABEL: uaddl_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16>
+; CODE-LABEL: uaddl_8h
+; CODE:       uaddl v0.8h, v0.8b, v1.8b
+define <8 x i16> @uaddl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = zext <8 x i8> %a to <8 x i16>
+  %tmp1 = zext <8 x i8> %b to <8 x i16>
+  %tmp2 = add <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; COST-LABEL: uaddl_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32>
+; CODE-LABEL: uaddl_4s
+; CODE:       uaddl v0.4s, v0.4h, v1.4h
+define <4 x i32> @uaddl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = zext <4 x i16> %a to <4 x i32>
+  %tmp1 = zext <4 x i16> %b to <4 x i32>
+  %tmp2 = add <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: uaddl_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64>
+; CODE-LABEL: uaddl_2d
+; CODE:       uaddl v0.2d, v0.2s, v1.2s
+define <2 x i64> @uaddl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = zext <2 x i32> %a to <2 x i64>
+  %tmp1 = zext <2 x i32> %b to <2 x i64>
+  %tmp2 = add <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; COST-LABEL: uaddl2_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16>
+; CODE-LABEL: uaddl2_8h
+; CODE:       uaddl2 v2.8h, v0.16b, v1.16b
+; CODE-NEXT:  uaddl v0.8h, v0.8b, v1.8b
+define <16 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = zext <16 x i8> %a to <16 x i16>
+  %tmp1 = zext <16 x i8> %b to <16 x i16>
+  %tmp2 = add <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; COST-LABEL: uaddl2_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32>
+; CODE-LABEL: uaddl2_4s
+; CODE:       uaddl2 v2.4s, v0.8h, v1.8h
+; CODE-NEXT:  uaddl v0.4s, v0.4h, v1.4h
+define <8 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = zext <8 x i16> %a to <8 x i32>
+  %tmp1 = zext <8 x i16> %b to <8 x i32>
+  %tmp2 = add <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; COST-LABEL: uaddl2_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64>
+; CODE-LABEL: uaddl2_2d
+; CODE:       uaddl2 v2.2d, v0.4s, v1.4s
+; CODE-NEXT:  uaddl v0.2d, v0.2s, v1.2s
+define <4 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = zext <4 x i32> %a to <4 x i64>
+  %tmp1 = zext <4 x i32> %b to <4 x i64>
+  %tmp2 = add <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}
+
+; COST-LABEL: saddl_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16>
+; CODE-LABEL: saddl_8h
+; CODE:       saddl v0.8h, v0.8b, v1.8b
+define <8 x i16> @saddl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = sext <8 x i8> %a to <8 x i16>
+  %tmp1 = sext <8 x i8> %b to <8 x i16>
+  %tmp2 = add <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; COST-LABEL: saddl_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32>
+; CODE-LABEL: saddl_4s
+; CODE:       saddl v0.4s, v0.4h, v1.4h
+define <4 x i32> @saddl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = sext <4 x i16> %a to <4 x i32>
+  %tmp1 = sext <4 x i16> %b to <4 x i32>
+  %tmp2 = add <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: saddl_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64>
+; CODE-LABEL: saddl_2d
+; CODE:       saddl v0.2d, v0.2s, v1.2s
+define <2 x i64> @saddl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = sext <2 x i32> %a to <2 x i64>
+  %tmp1 = sext <2 x i32> %b to <2 x i64>
+  %tmp2 = add <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; COST-LABEL: saddl2_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16>
+; CODE-LABEL: saddl2_8h
+; CODE:       saddl2 v2.8h, v0.16b, v1.16b
+; CODE-NEXT:  saddl v0.8h, v0.8b, v1.8b
+define <16 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = sext <16 x i8> %a to <16 x i16>
+  %tmp1 = sext <16 x i8> %b to <16 x i16>
+  %tmp2 = add <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; COST-LABEL: saddl2_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32>
+; CODE-LABEL: saddl2_4s
+; CODE:       saddl2 v2.4s, v0.8h, v1.8h
+; CODE-NEXT:  saddl v0.4s, v0.4h, v1.4h
+define <8 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = sext <8 x i16> %a to <8 x i32>
+  %tmp1 = sext <8 x i16> %b to <8 x i32>
+  %tmp2 = add <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; COST-LABEL: saddl2_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64>
+; CODE-LABEL: saddl2_2d
+; CODE:       saddl2 v2.2d, v0.4s, v1.4s
+; CODE-NEXT:  saddl v0.2d, v0.2s, v1.2s
+define <4 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = sext <4 x i32> %a to <4 x i64>
+  %tmp1 = sext <4 x i32> %b to <4 x i64>
+  %tmp2 = add <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}
+
+; COST-LABEL: usubl_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16>
+; CODE-LABEL: usubl_8h
+; CODE:       usubl v0.8h, v0.8b, v1.8b
+define <8 x i16> @usubl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = zext <8 x i8> %a to <8 x i16>
+  %tmp1 = zext <8 x i8> %b to <8 x i16>
+  %tmp2 = sub <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; COST-LABEL: usubl_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32>
+; CODE-LABEL: usubl_4s
+; CODE:       usubl v0.4s, v0.4h, v1.4h
+define <4 x i32> @usubl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = zext <4 x i16> %a to <4 x i32>
+  %tmp1 = zext <4 x i16> %b to <4 x i32>
+  %tmp2 = sub <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: usubl_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64>
+; CODE-LABEL: usubl_2d
+; CODE:       usubl v0.2d, v0.2s, v1.2s
+define <2 x i64> @usubl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = zext <2 x i32> %a to <2 x i64>
+  %tmp1 = zext <2 x i32> %b to <2 x i64>
+  %tmp2 = sub <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; COST-LABEL: usubl2_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16>
+; CODE-LABEL: usubl2_8h
+; CODE:       usubl2 v2.8h, v0.16b, v1.16b
+; CODE-NEXT:  usubl v0.8h, v0.8b, v1.8b
+define <16 x i16> @usubl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = zext <16 x i8> %a to <16 x i16>
+  %tmp1 = zext <16 x i8> %b to <16 x i16>
+  %tmp2 = sub <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; COST-LABEL: usubl2_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32>
+; CODE-LABEL: usubl2_4s
+; CODE:       usubl2 v2.4s, v0.8h, v1.8h
+; CODE-NEXT:  usubl v0.4s, v0.4h, v1.4h
+define <8 x i32> @usubl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = zext <8 x i16> %a to <8 x i32>
+  %tmp1 = zext <8 x i16> %b to <8 x i32>
+  %tmp2 = sub <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; COST-LABEL: usubl2_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64>
+; CODE-LABEL: usubl2_2d
+; CODE:       usubl2 v2.2d, v0.4s, v1.4s
+; CODE-NEXT:  usubl v0.2d, v0.2s, v1.2s
+define <4 x i64> @usubl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = zext <4 x i32> %a to <4 x i64>
+  %tmp1 = zext <4 x i32> %b to <4 x i64>
+  %tmp2 = sub <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}
+
+; COST-LABEL: ssubl_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16>
+; CODE-LABEL: ssubl_8h
+; CODE:       ssubl v0.8h, v0.8b, v1.8b
+define <8 x i16> @ssubl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = sext <8 x i8> %a to <8 x i16>
+  %tmp1 = sext <8 x i8> %b to <8 x i16>
+  %tmp2 = sub <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; COST-LABEL: ssubl_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32>
+; CODE-LABEL: ssubl_4s
+; CODE:       ssubl v0.4s, v0.4h, v1.4h
+define <4 x i32> @ssubl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = sext <4 x i16> %a to <4 x i32>
+  %tmp1 = sext <4 x i16> %b to <4 x i32>
+  %tmp2 = sub <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: ssubl_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64>
+; CODE-LABEL: ssubl_2d
+; CODE:       ssubl v0.2d, v0.2s, v1.2s
+define <2 x i64> @ssubl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = sext <2 x i32> %a to <2 x i64>
+  %tmp1 = sext <2 x i32> %b to <2 x i64>
+  %tmp2 = sub <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; COST-LABEL: ssubl2_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16>
+; CODE-LABEL: ssubl2_8h
+; CODE:       ssubl2 v2.8h, v0.16b, v1.16b
+; CODE-NEXT:  ssubl v0.8h, v0.8b, v1.8b
+define <16 x i16> @ssubl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = sext <16 x i8> %a to <16 x i16>
+  %tmp1 = sext <16 x i8> %b to <16 x i16>
+  %tmp2 = sub <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; COST-LABEL: ssubl2_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32>
+; CODE-LABEL: ssubl2_4s
+; CODE:       ssubl2 v2.4s, v0.8h, v1.8h
+; CODE-NEXT:  ssubl v0.4s, v0.4h, v1.4h
+define <8 x i32> @ssubl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = sext <8 x i16> %a to <8 x i32>
+  %tmp1 = sext <8 x i16> %b to <8 x i32>
+  %tmp2 = sub <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; COST-LABEL: ssubl2_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64>
+; CODE-LABEL: ssubl2_2d
+; CODE:       ssubl2 v2.2d, v0.4s, v1.4s
+; CODE-NEXT:  ssubl v0.2d, v0.2s, v1.2s
+define <4 x i64> @ssubl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = sext <4 x i32> %a to <4 x i64>
+  %tmp1 = sext <4 x i32> %b to <4 x i64>
+  %tmp2 = sub <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}
+
+; COST-LABEL: uaddw_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; CODE-LABEL: uaddw_8h
+; CODE:       uaddw v0.8h, v1.8h, v0.8b
+define <8 x i16> @uaddw_8h(<8 x i8> %a, <8 x i16> %b) {
+  %tmp0 = zext <8 x i8> %a to <8 x i16>
+  %tmp1 = add <8 x i16> %b, %tmp0
+  ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: uaddw_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; CODE-LABEL: uaddw_4s
+; CODE:       uaddw v0.4s, v1.4s, v0.4h
+define <4 x i32> @uaddw_4s(<4 x i16> %a, <4 x i32> %b) {
+  %tmp0 = zext <4 x i16> %a to <4 x i32>
+  %tmp1 = add <4 x i32> %b, %tmp0
+  ret <4 x i32> %tmp1
+}
+
+; COST-LABEL: uaddw_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; CODE-LABEL: uaddw_2d
+; CODE:       uaddw v0.2d, v1.2d, v0.2s
+define <2 x i64> @uaddw_2d(<2 x i32> %a, <2 x i64> %b) {
+  %tmp0 = zext <2 x i32> %a to <2 x i64>
+  %tmp1 = add <2 x i64> %b, %tmp0
+  ret <2 x i64> %tmp1
+}
+
+; COST-LABEL: uaddw2_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; CODE-LABEL: uaddw2_8h
+; CODE:       uaddw2 v2.8h, v2.8h, v0.16b
+; CODE-NEXT:  uaddw v0.8h, v1.8h, v0.8b
+define <16 x i16> @uaddw2_8h(<16 x i8> %a, <16 x i16> %b) {
+  %tmp0 = zext <16 x i8> %a to <16 x i16>
+  %tmp1 = add <16 x i16> %b, %tmp0
+  ret <16 x i16> %tmp1
+}
+
+; COST-LABEL: uaddw2_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; CODE-LABEL: uaddw2_4s
+; CODE:       uaddw2 v2.4s, v2.4s, v0.8h
+; CODE-NEXT:  uaddw v0.4s, v1.4s, v0.4h
+define <8 x i32> @uaddw2_4s(<8 x i16> %a, <8 x i32> %b) {
+  %tmp0 = zext <8 x i16> %a to <8 x i32>
+  %tmp1 = add <8 x i32> %b, %tmp0
+  ret <8 x i32> %tmp1
+}
+
+; COST-LABEL: uaddw2_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; CODE-LABEL: uaddw2_2d
+; CODE:       uaddw2 v2.2d, v2.2d, v0.4s
+; CODE-NEXT:  uaddw v0.2d, v1.2d, v0.2s
+define <4 x i64> @uaddw2_2d(<4 x i32> %a, <4 x i64> %b) {
+  %tmp0 = zext <4 x i32> %a to <4 x i64>
+  %tmp1 = add <4 x i64> %b, %tmp0
+  ret <4 x i64> %tmp1
+}
+
+; COST-LABEL: saddw_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; CODE-LABEL: saddw_8h
+; CODE:       saddw v0.8h, v1.8h, v0.8b
+define <8 x i16> @saddw_8h(<8 x i8> %a, <8 x i16> %b) {
+  %tmp0 = sext <8 x i8> %a to <8 x i16>
+  %tmp1 = add <8 x i16> %b, %tmp0
+  ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: saddw_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; CODE-LABEL: saddw_4s
+; CODE:       saddw v0.4s, v1.4s, v0.4h
+define <4 x i32> @saddw_4s(<4 x i16> %a, <4 x i32> %b) {
+  %tmp0 = sext <4 x i16> %a to <4 x i32>
+  %tmp1 = add <4 x i32> %b, %tmp0
+  ret <4 x i32> %tmp1
+}
+
+; COST-LABEL: saddw_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; CODE-LABEL: saddw_2d
+; CODE:       saddw v0.2d, v1.2d, v0.2s
+define <2 x i64> @saddw_2d(<2 x i32> %a, <2 x i64> %b) {
+  %tmp0 = sext <2 x i32> %a to <2 x i64>
+  %tmp1 = add <2 x i64> %b, %tmp0
+  ret <2 x i64> %tmp1
+}
+
+; COST-LABEL: saddw2_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; CODE-LABEL: saddw2_8h
+; CODE:       saddw2 v2.8h, v2.8h, v0.16b
+; CODE-NEXT:  saddw v0.8h, v1.8h, v0.8b
+define <16 x i16> @saddw2_8h(<16 x i8> %a, <16 x i16> %b) {
+  %tmp0 = sext <16 x i8> %a to <16 x i16>
+  %tmp1 = add <16 x i16> %b, %tmp0
+  ret <16 x i16> %tmp1
+}
+
+; COST-LABEL: saddw2_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; CODE-LABEL: saddw2_4s
+; CODE:       saddw2 v2.4s, v2.4s, v0.8h
+; CODE-NEXT:  saddw v0.4s, v1.4s, v0.4h
+define <8 x i32> @saddw2_4s(<8 x i16> %a, <8 x i32> %b) {
+  %tmp0 = sext <8 x i16> %a to <8 x i32>
+  %tmp1 = add <8 x i32> %b, %tmp0
+  ret <8 x i32> %tmp1
+}
+
+; COST-LABEL: saddw2_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; CODE-LABEL: saddw2_2d
+; CODE:       saddw2 v2.2d, v2.2d, v0.4s
+; CODE-NEXT:  saddw v0.2d, v1.2d, v0.2s
+define <4 x i64> @saddw2_2d(<4 x i32> %a, <4 x i64> %b) {
+  %tmp0 = sext <4 x i32> %a to <4 x i64>
+  %tmp1 = add <4 x i64> %b, %tmp0
+  ret <4 x i64> %tmp1
+}
+
+; COST-LABEL: usubw_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; CODE-LABEL: usubw_8h
+; CODE:       usubw v0.8h, v1.8h, v0.8b
+define <8 x i16> @usubw_8h(<8 x i8> %a, <8 x i16> %b) {
+  %tmp0 = zext <8 x i8> %a to <8 x i16>
+  %tmp1 = sub <8 x i16> %b, %tmp0
+  ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: usubw_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; CODE-LABEL: usubw_4s
+; CODE:       usubw v0.4s, v1.4s, v0.4h
+define <4 x i32> @usubw_4s(<4 x i16> %a, <4 x i32> %b) {
+  %tmp0 = zext <4 x i16> %a to <4 x i32>
+  %tmp1 = sub <4 x i32> %b, %tmp0
+  ret <4 x i32> %tmp1
+}
+
+; COST-LABEL: usubw_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; CODE-LABEL: usubw_2d
+; CODE:       usubw v0.2d, v1.2d, v0.2s
+define <2 x i64> @usubw_2d(<2 x i32> %a, <2 x i64> %b) {
+  %tmp0 = zext <2 x i32> %a to <2 x i64>
+  %tmp1 = sub <2 x i64> %b, %tmp0
+  ret <2 x i64> %tmp1
+}
+
+; COST-LABEL: usubw2_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; CODE-LABEL: usubw2_8h
+; CODE:       usubw2 v2.8h, v2.8h, v0.16b
+; CODE-NEXT:  usubw v0.8h, v1.8h, v0.8b
+define <16 x i16> @usubw2_8h(<16 x i8> %a, <16 x i16> %b) {
+  %tmp0 = zext <16 x i8> %a to <16 x i16>
+  %tmp1 = sub <16 x i16> %b, %tmp0
+  ret <16 x i16> %tmp1
+}
+
+; COST-LABEL: usubw2_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; CODE-LABEL: usubw2_4s
+; CODE:       usubw2 v2.4s, v2.4s, v0.8h
+; CODE-NEXT:  usubw v0.4s, v1.4s, v0.4h
+define <8 x i32> @usubw2_4s(<8 x i16> %a, <8 x i32> %b) {
+  %tmp0 = zext <8 x i16> %a to <8 x i32>
+  %tmp1 = sub <8 x i32> %b, %tmp0
+  ret <8 x i32> %tmp1
+}
+
+; COST-LABEL: usubw2_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; CODE-LABEL: usubw2_2d
+; CODE:       usubw2 v2.2d, v2.2d, v0.4s
+; CODE-NEXT:  usubw v0.2d, v1.2d, v0.2s
+define <4 x i64> @usubw2_2d(<4 x i32> %a, <4 x i64> %b) {
+  %tmp0 = zext <4 x i32> %a to <4 x i64>
+  %tmp1 = sub <4 x i64> %b, %tmp0
+  ret <4 x i64> %tmp1
+}
+
+; COST-LABEL: ssubw_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; CODE-LABEL: ssubw_8h
+; CODE:       ssubw v0.8h, v1.8h, v0.8b
+define <8 x i16> @ssubw_8h(<8 x i8> %a, <8 x i16> %b) {
+  %tmp0 = sext <8 x i8> %a to <8 x i16>
+  %tmp1 = sub <8 x i16> %b, %tmp0
+  ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: ssubw_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; CODE-LABEL: ssubw_4s
+; CODE:       ssubw v0.4s, v1.4s, v0.4h
+define <4 x i32> @ssubw_4s(<4 x i16> %a, <4 x i32> %b) {
+  %tmp0 = sext <4 x i16> %a to <4 x i32>
+  %tmp1 = sub <4 x i32> %b, %tmp0
+  ret <4 x i32> %tmp1
+}
+
+; COST-LABEL: ssubw_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; CODE-LABEL: ssubw_2d
+; CODE:       ssubw v0.2d, v1.2d, v0.2s
+define <2 x i64> @ssubw_2d(<2 x i32> %a, <2 x i64> %b) {
+  %tmp0 = sext <2 x i32> %a to <2 x i64>
+  %tmp1 = sub <2 x i64> %b, %tmp0
+  ret <2 x i64> %tmp1
+}
+
+; COST-LABEL: ssubw2_8h
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; CODE-LABEL: ssubw2_8h
+; CODE:       ssubw2 v2.8h, v2.8h, v0.16b
+; CODE-NEXT:  ssubw v0.8h, v1.8h, v0.8b
+define <16 x i16> @ssubw2_8h(<16 x i8> %a, <16 x i16> %b) {
+  %tmp0 = sext <16 x i8> %a to <16 x i16>
+  %tmp1 = sub <16 x i16> %b, %tmp0
+  ret <16 x i16> %tmp1
+}
+
+; COST-LABEL: ssubw2_4s
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; CODE-LABEL: ssubw2_4s
+; CODE:       ssubw2 v2.4s, v2.4s, v0.8h
+; CODE-NEXT:  ssubw v0.4s, v1.4s, v0.4h
+define <8 x i32> @ssubw2_4s(<8 x i16> %a, <8 x i32> %b) {
+  %tmp0 = sext <8 x i16> %a to <8 x i32>
+  %tmp1 = sub <8 x i32> %b, %tmp0
+  ret <8 x i32> %tmp1
+}
+
+; COST-LABEL: ssubw2_2d
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; CODE-LABEL: ssubw2_2d
+; CODE:       ssubw2 v2.2d, v2.2d, v0.4s
+; CODE-NEXT:  ssubw v0.2d, v1.2d, v0.2s
+define <4 x i64> @ssubw2_2d(<4 x i32> %a, <4 x i64> %b) {
+  %tmp0 = sext <4 x i32> %a to <4 x i64>
+  %tmp1 = sub <4 x i64> %b, %tmp0
+  ret <4 x i64> %tmp1
+}
+
+; COST-LABEL: neg_wrong_operand_order
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+define <8 x i16> @neg_wrong_operand_order(<8 x i8> %a, <8 x i16> %b) {
+  %tmp0 = zext <8 x i8> %a to <8 x i16>
+  %tmp1 = sub <8 x i16> %tmp0, %b
+  ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: neg_non_widening_op
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+define <8 x i16> @neg_non_widening_op(<8 x i8> %a, <8 x i16> %b) {
+  %tmp0 = zext <8 x i8> %a to <8 x i16>
+  %tmp1 = udiv <8 x i16> %b, %tmp0
+  ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: neg_dissimilar_operand_kind_0
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16>
+define <8 x i16> @neg_dissimilar_operand_kind_0(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = sext <8 x i8> %a to <8 x i16>
+  %tmp1 = zext <8 x i8> %b to <8 x i16>
+  %tmp2 = add <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; COST-LABEL: neg_dissimilar_operand_kind_1
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <4 x i8> %a to <4 x i32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32>
+define <4 x i32> @neg_dissimilar_operand_kind_1(<4 x i8> %a, <4 x i16> %b) {
+  %tmp0 = zext <4 x i8> %a to <4 x i32>
+  %tmp1 = zext <4 x i16> %b to <4 x i32>
+  %tmp2 = add <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: neg_illegal_vector_type_0
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <16 x i4> %a to <16 x i8>
+define <16 x i8> @neg_illegal_vector_type_0(<16 x i4> %a, <16 x i8> %b) {
+  %tmp0 = zext <16 x i4> %a to <16 x i8>
+  %tmp1 = sub <16 x i8> %b, %tmp0
+  ret <16 x i8> %tmp1
+}
+
+; COST-LABEL: neg_llegal_vector_type_1
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <1 x i16> %a to <1 x i32>
+define <1 x i32> @neg_llegal_vector_type_1(<1 x i16> %a, <1 x i32> %b) {
+  %tmp0 = zext <1 x i16> %a to <1 x i32>
+  %tmp1 = add <1 x i32> %b, %tmp0
+  ret <1 x i32> %tmp1
+}
+
+; COST-LABEL: neg_llegal_vector_type_2
+; COST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i64>
+define <4 x i64> @neg_llegal_vector_type_2(<4 x i16> %a, <4 x i64> %b) {
+  %tmp0 = zext <4 x i16> %a to <4 x i64>
+  %tmp1 = add <4 x i64> %b, %tmp0
+  ret <4 x i64> %tmp1
+}
+
+; COST-LABEL: neg_llegal_vector_type_3
+; COST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <3 x i34> %a to <3 x i68>
+define <3 x i68> @neg_llegal_vector_type_3(<3 x i34> %a, <3 x i68> %b) {
+  %tmp0 = zext <3 x i34> %a to <3 x i68>
+  %tmp1 = add <3 x i68> %b, %tmp0
+  ret <3 x i68> %tmp1
+}