[llvm] r351957 - [IR] Match intrinsic parameter by scalar/vectorwidth

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 23 08:00:23 PST 2019


Author: rksimon
Date: Wed Jan 23 08:00:22 2019
New Revision: 351957

URL: http://llvm.org/viewvc/llvm-project?rev=351957&view=rev
Log:
[IR] Match intrinsic parameter by scalar/vectorwidth

This patch replaces the existing LLVMVectorSameWidth matcher with LLVMScalarOrSameVectorWidth.

The matched argument and the parameter it references must either both be scalars or both be vectors with the same number of elements, but in either case the scalar/element type can differ, as specified by LLVMScalarOrSameVectorWidth. For example, with overload type 0 bound to <8 x i16>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> resolves to <8 x i1>; bound to plain i16, it resolves to i1.

I've updated the _overflow intrinsics to demonstrate this, allowing them to return an i1 or <N x i1> overflow result that matches the scalar/vector width of the other (add/sub/mul) result type.
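
For example, both of the following declarations (taken from the new cost-model test added below) are now valid, with the width of the overflow result tracking the value result:

  declare {i32, i1}             @llvm.sadd.with.overflow.i32(i32, i32)
  declare {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)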

The masked load/store/gather/scatter intrinsics have also been updated to use this. However, since we specify the reference type to be llvm_anyvector_ty, the mask is guaranteed to be <N x i1>, so there is no change in behaviour.
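
For reference, a typical masked load declaration keeps its <N x i1> mask operand unchanged under the new matcher; a sketch using the typed-pointer name mangling of this era, where the operands are pointer, alignment, mask, and passthru:

  declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)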

Differential Revision: https://reviews.llvm.org/D57090

Added:
    llvm/trunk/test/Analysis/CostModel/X86/arith-overflow.ll
Modified:
    llvm/trunk/include/llvm/IR/Intrinsics.td
    llvm/trunk/lib/IR/Function.cpp
    llvm/trunk/utils/TableGen/CodeGenTarget.cpp
    llvm/trunk/utils/TableGen/IntrinsicEmitter.cpp

Modified: llvm/trunk/include/llvm/IR/Intrinsics.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/Intrinsics.td?rev=351957&r1=351956&r2=351957&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/Intrinsics.td (original)
+++ llvm/trunk/include/llvm/IR/Intrinsics.td Wed Jan 23 08:00:22 2019
@@ -156,10 +156,15 @@ class LLVMMatchType<int num>
 // the intrinsic is overloaded, so the matched type should be declared as iAny.
 class LLVMExtendedType<int num> : LLVMMatchType<num>;
 class LLVMTruncatedType<int num> : LLVMMatchType<num>;
-class LLVMVectorSameWidth<int num, LLVMType elty>
-  : LLVMMatchType<num> {
+
+// Match the scalar/vector of another intrinsic parameter but with a different
+// element type. Either both are scalars or both are vectors with the same
+// number of elements.
+class LLVMScalarOrSameVectorWidth<int idx, LLVMType elty>
+  : LLVMMatchType<idx> {
   ValueType ElTy = elty.VT;
 }
+
 class LLVMPointerTo<int num> : LLVMMatchType<num>;
 class LLVMPointerToElt<int num> : LLVMMatchType<num>;
 class LLVMVectorOfAnyPointersToElt<int num> : LLVMMatchType<num>;
@@ -796,24 +801,30 @@ def int_adjust_trampoline : Intrinsic<[l
 //
 
 // Expose the carry flag from add operations on two integrals.
-def int_sadd_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_sadd_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                        [LLVMMatchType<0>, LLVMMatchType<0>],
                                        [IntrNoMem, IntrSpeculatable]>;
-def int_uadd_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_uadd_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                        [LLVMMatchType<0>, LLVMMatchType<0>],
                                        [IntrNoMem, IntrSpeculatable]>;
 
-def int_ssub_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_ssub_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                        [LLVMMatchType<0>, LLVMMatchType<0>],
                                        [IntrNoMem, IntrSpeculatable]>;
-def int_usub_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_usub_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                        [LLVMMatchType<0>, LLVMMatchType<0>],
                                        [IntrNoMem, IntrSpeculatable]>;
 
-def int_smul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_smul_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                        [LLVMMatchType<0>, LLVMMatchType<0>],
                                        [IntrNoMem, IntrSpeculatable]>;
-def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                        [LLVMMatchType<0>, LLVMMatchType<0>],
                                        [IntrNoMem, IntrSpeculatable]>;
 
@@ -1001,35 +1012,35 @@ def int_is_constant : Intrinsic<[llvm_i1
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty,
                                       LLVMAnyPointerType<LLVMMatchType<0>>,
                                       llvm_i32_ty,
-                                      LLVMVectorSameWidth<0, llvm_i1_ty>],
+                                      LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                  [IntrArgMemOnly]>;
 
 def int_masked_load  : Intrinsic<[llvm_anyvector_ty],
                                  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty,
-                                  LLVMVectorSameWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
+                                  LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
                                  [IntrReadMem, IntrArgMemOnly]>;
 
 def int_masked_gather: Intrinsic<[llvm_anyvector_ty],
                                  [LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty,
-                                  LLVMVectorSameWidth<0, llvm_i1_ty>,
+                                  LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                   LLVMMatchType<0>],
                                  [IntrReadMem]>;
 
 def int_masked_scatter: Intrinsic<[],
                                   [llvm_anyvector_ty,
                                    LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty,
-                                   LLVMVectorSameWidth<0, llvm_i1_ty>]>;
+                                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>]>;
 
 def int_masked_expandload: Intrinsic<[llvm_anyvector_ty],
                                      [LLVMPointerToElt<0>,
-                                      LLVMVectorSameWidth<0, llvm_i1_ty>,
+                                      LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                       LLVMMatchType<0>],
                                      [IntrReadMem]>;
 
 def int_masked_compressstore: Intrinsic<[],
                                      [llvm_anyvector_ty,
                                       LLVMPointerToElt<0>,
-                                      LLVMVectorSameWidth<0, llvm_i1_ty>],
+                                      LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                      [IntrArgMemOnly]>;
 
 // Test whether a pointer is associated with a type metadata identifier.

Modified: llvm/trunk/lib/IR/Function.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/IR/Function.cpp?rev=351957&r1=351956&r2=351957&view=diff
==============================================================================
--- llvm/trunk/lib/IR/Function.cpp (original)
+++ llvm/trunk/lib/IR/Function.cpp Wed Jan 23 08:00:22 2019
@@ -948,10 +948,9 @@ static Type *DecodeFixedType(ArrayRef<In
   case IITDescriptor::SameVecWidthArgument: {
     Type *EltTy = DecodeFixedType(Infos, Tys, Context);
     Type *Ty = Tys[D.getArgumentNumber()];
-    if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    if (auto *VTy = dyn_cast<VectorType>(Ty))
       return VectorType::get(EltTy, VTy->getNumElements());
-    }
-    llvm_unreachable("unhandled");
+    return EltTy;
   }
   case IITDescriptor::PtrToArgument: {
     Type *Ty = Tys[D.getArgumentNumber()];
@@ -1135,15 +1134,19 @@ bool Intrinsic::matchIntrinsicType(Type
     case IITDescriptor::SameVecWidthArgument: {
       if (D.getArgumentNumber() >= ArgTys.size())
         return true;
-      VectorType * ReferenceType =
-        dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
-      VectorType *ThisArgType = dyn_cast<VectorType>(Ty);
-      if (!ThisArgType || !ReferenceType ||
-          (ReferenceType->getVectorNumElements() !=
-           ThisArgType->getVectorNumElements()))
-        return true;
-      return matchIntrinsicType(ThisArgType->getVectorElementType(),
-                                Infos, ArgTys);
+      auto *ReferenceType = dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
+      auto *ThisArgType = dyn_cast<VectorType>(Ty);
+      // Both must be vectors of the same number of elements or neither.
+      if ((ReferenceType != nullptr) != (ThisArgType != nullptr))
+        return true;
+      Type *EltTy = Ty;
+      if (ThisArgType) {
+        if (ReferenceType->getVectorNumElements() !=
+            ThisArgType->getVectorNumElements())
+          return true;
+        EltTy = ThisArgType->getVectorElementType();
+      }
+      return matchIntrinsicType(EltTy, Infos, ArgTys);
     }
     case IITDescriptor::PtrToArgument: {
       if (D.getArgumentNumber() >= ArgTys.size())

Added: llvm/trunk/test/Analysis/CostModel/X86/arith-overflow.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/arith-overflow.ll?rev=351957&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/arith-overflow.ll (added)
+++ llvm/trunk/test/Analysis/CostModel/X86/arith-overflow.ll Wed Jan 23 08:00:22 2019
@@ -0,0 +1,414 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+;
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,GLM
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,BTVER2
+
+;
+; sadd.with.overflow
+;
+
+declare {i64, i1}              @llvm.sadd.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.sadd.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.sadd.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.sadd.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.sadd.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.sadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.sadd.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.sadd.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.sadd.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.sadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.sadd.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.sadd.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @sadd(i32 %arg) {
+; CHECK-LABEL: 'sadd'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}
+
+;
+; uadd.with.overflow
+;
+
+declare {i64, i1}              @llvm.uadd.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.uadd.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.uadd.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.uadd.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.uadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.uadd.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.uadd.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.uadd.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.uadd.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.uadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.uadd.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.uadd.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @uadd(i32 %arg) {
+; CHECK-LABEL: 'uadd'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}
+
+;
+; ssub.with.overflow
+;
+
+declare {i64, i1}              @llvm.ssub.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.ssub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.ssub.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.ssub.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.ssub.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.ssub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.ssub.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.ssub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.ssub.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.ssub.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.ssub.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.ssub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.ssub.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.ssub.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @ssub(i32 %arg) {
+; CHECK-LABEL: 'ssub'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}
+
+;
+; usub.with.overflow
+;
+
+declare {i64, i1}              @llvm.usub.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.usub.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.usub.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.usub.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.usub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.usub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.usub.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.usub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.usub.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.usub.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.usub.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.usub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.usub.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.usub.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @usub(i32 %arg) {
+; CHECK-LABEL: 'usub'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}
+
+;
+; smul.with.overflow
+;
+
+declare {i64, i1}              @llvm.smul.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.smul.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.smul.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.smul.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.smul.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.smul.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.smul.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.smul.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.smul.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.smul.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.smul.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.smul.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.smul.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.smul.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.smul.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.smul.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @smul(i32 %arg) {
+; CHECK-LABEL: 'smul'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}
+
+;
+; umul.with.overflow
+;
+
+declare {i64, i1}              @llvm.umul.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.umul.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.umul.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.umul.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.umul.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.umul.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.umul.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.umul.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.umul.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.umul.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.umul.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.umul.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.umul.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @umul(i32 %arg) {
+; CHECK-LABEL: 'umul'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}

Modified: llvm/trunk/utils/TableGen/CodeGenTarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/utils/TableGen/CodeGenTarget.cpp?rev=351957&r1=351956&r2=351957&view=diff
==============================================================================
--- llvm/trunk/utils/TableGen/CodeGenTarget.cpp (original)
+++ llvm/trunk/utils/TableGen/CodeGenTarget.cpp Wed Jan 23 08:00:22 2019
@@ -633,7 +633,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Recor
       // overloaded, all the types can be specified directly.
       assert(((!TyEl->isSubClassOf("LLVMExtendedType") &&
                !TyEl->isSubClassOf("LLVMTruncatedType") &&
-               !TyEl->isSubClassOf("LLVMVectorSameWidth")) ||
+               !TyEl->isSubClassOf("LLVMScalarOrSameVectorWidth")) ||
               VT == MVT::iAny || VT == MVT::vAny) &&
              "Expected iAny or vAny type");
     } else

Modified: llvm/trunk/utils/TableGen/IntrinsicEmitter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/utils/TableGen/IntrinsicEmitter.cpp?rev=351957&r1=351956&r2=351957&view=diff
==============================================================================
--- llvm/trunk/utils/TableGen/IntrinsicEmitter.cpp (original)
+++ llvm/trunk/utils/TableGen/IntrinsicEmitter.cpp Wed Jan 23 08:00:22 2019
@@ -269,7 +269,7 @@ static void EncodeFixedType(Record *R, s
       Sig.push_back(IIT_TRUNC_ARG);
     else if (R->isSubClassOf("LLVMHalfElementsVectorType"))
       Sig.push_back(IIT_HALF_VEC_ARG);
-    else if (R->isSubClassOf("LLVMVectorSameWidth")) {
+    else if (R->isSubClassOf("LLVMScalarOrSameVectorWidth")) {
       Sig.push_back(IIT_SAME_VEC_WIDTH_ARG);
       Sig.push_back((Number << 3) | ArgCodes[Number]);
       MVT::SimpleValueType VT = getValueType(R->getValueAsDef("ElTy"));



