[llvm] [CostModel][X86] Treat lrint/llrint as fptosi calls (PR #90883)

Fri May 3 09:41:18 PDT 2024

https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/90883

>From abaa3cf93eee4d00349538c2f7f6d16efcb32aaa Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 2 May 2024 18:54:13 +0100
Subject: [PATCH] [CostModel][X86] Treat lrint/llrint as fptosi calls

X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which have the same costs as the CVTTP2SI (fptosi) instructions

Followup to #90065
---
 .../lib/Target/X86/X86TargetTransformInfo.cpp |   8 +
 .../CostModel/X86/arith-fp-codesize.ll        |  89 ++---
 .../CostModel/X86/arith-fp-latency.ll         | 170 +++------
 .../CostModel/X86/arith-fp-sizelatency.ll     | 170 +++------
 llvm/test/Analysis/CostModel/X86/arith-fp.ll  | 195 +++++-----
 .../SLPVectorizer/X86/arith-fp-call.ll        | 358 ++++++++++++------
 6 files changed, 506 insertions(+), 484 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 6c174f19b10707..c2f80cab537c4e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4153,6 +4153,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
       }
     }
     break;
+  case Intrinsic::lrint:
+  case Intrinsic::llrint:
+    if (!ICA.isTypeBasedOnly()) {
+      const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
+      return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
+                              TTI::CastContextHint::None, CostKind);
+    }
+    break;
   case Intrinsic::maxnum:
   case Intrinsic::minnum:
     // FMINNUM has same costs so don't duplicate.
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-fp-codesize.ll
index 5247ef0cbd707b..b965a726262e68 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp-codesize.ll
@@ -666,35 +666,46 @@ define i32 @rint(i32 %arg) {
 define i32 @lrint(i32 %arg) {
 ; SSE1-LABEL: 'lrint'
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE2-LABEL: 'lrint'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX-LABEL: 'lrint'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'lrint'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = call i32 @llvm.lrint.i32.f32(float undef)
@@ -711,38 +722,16 @@ define i32 @lrint(i32 %arg) {
 }
 
 define i32 @llrint(i32 %arg) {
-; SSE1-LABEL: 'llrint'
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX-LABEL: 'llrint'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX512-LABEL: 'llrint'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-LABEL: 'llrint'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = call i64 @llvm.llrint.i64.f32(float undef)
   %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp-latency.ll b/llvm/test/Analysis/CostModel/X86/arith-fp-latency.ll
index 3f4ecdf340d81b..c147bd2eef6e7e 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp-latency.ll
@@ -1106,79 +1106,79 @@ define i32 @rint(i32 %arg) {
 define i32 @lrint(i32 %arg) {
 ; SSE1-LABEL: 'lrint'
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE2-LABEL: 'lrint'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'lrint'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'lrint'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'lrint'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'lrint'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'lrint'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = call i32 @llvm.lrint.i32.f32(float undef)
@@ -1195,82 +1195,16 @@ define i32 @lrint(i32 %arg) {
 }
 
 define i32 @llrint(i32 %arg) {
-; SSE1-LABEL: 'llrint'
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; SSE2-LABEL: 'llrint'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'llrint'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX-LABEL: 'llrint'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX512-LABEL: 'llrint'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; SLM-LABEL: 'llrint'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; GLM-LABEL: 'llrint'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-LABEL: 'llrint'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = call i64 @llvm.llrint.i64.f32(float undef)
   %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-fp-sizelatency.ll
index 8040675bef9c5c..d9312ac05601de 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp-sizelatency.ll
@@ -1040,79 +1040,79 @@ define i32 @rint(i32 %arg) {
 define i32 @lrint(i32 %arg) {
 ; SSE1-LABEL: 'lrint'
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE2-LABEL: 'lrint'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'lrint'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'lrint'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'lrint'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'lrint'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'lrint'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = call i32 @llvm.lrint.i32.f32(float undef)
@@ -1129,82 +1129,16 @@ define i32 @lrint(i32 %arg) {
 }
 
 define i32 @llrint(i32 %arg) {
-; SSE1-LABEL: 'llrint'
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; SSE2-LABEL: 'llrint'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'llrint'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX-LABEL: 'llrint'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX512-LABEL: 'llrint'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; SLM-LABEL: 'llrint'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; GLM-LABEL: 'llrint'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-LABEL: 'llrint'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = call i64 @llvm.llrint.i64.f32(float undef)
   %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp.ll b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
index 17bd223affa1d0..90871e3a3831c0 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
@@ -1183,79 +1183,90 @@ define i32 @rint(i32 %arg) {
 define i32 @lrint(i32 %arg) {
 ; SSE1-LABEL: 'lrint'
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE2-LABEL: 'lrint'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'lrint'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'lrint'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'lrint'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'lrint'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'lrint'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'lrint'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'lrint'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F32 = call i32 @llvm.lrint.i32.f32(float undef)
@@ -1274,79 +1285,79 @@ define i32 @lrint(i32 %arg) {
 define i32 @llrint(i32 %arg) {
 ; SSE1-LABEL: 'llrint'
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE2-LABEL: 'llrint'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'llrint'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'llrint'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'llrint'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'llrint'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'llrint'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F32 = call i64 @llvm.llrint.i64.f32(float undef)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-call.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-call.ll
index 413b3a70ebbb18..0bd152e18fb495 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-call.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-call.ll
@@ -3,8 +3,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v2 -mattr=+prefer-128-bit | FileCheck %s --check-prefixes=CHECK,VEC128
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v2 -mattr=-prefer-128-bit | FileCheck %s --check-prefixes=CHECK,VEC128
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v3 -mattr=+prefer-128-bit | FileCheck %s --check-prefixes=CHECK,VEC128
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v3 -mattr=-prefer-128-bit | FileCheck %s --check-prefixes=CHECK,VEC256
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v4 -mattr=+prefer-256-bit | FileCheck %s --check-prefixes=CHECK,VEC256
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v3 -mattr=-prefer-128-bit | FileCheck %s --check-prefixes=CHECK,VEC256,VEC256-AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v4 -mattr=+prefer-256-bit | FileCheck %s --check-prefixes=CHECK,VEC256,VEC256-AVX512
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v4 -mattr=-prefer-256-bit | FileCheck %s --check-prefixes=CHECK,VEC512
 
 @f64 = common global [16 x double] zeroinitializer, align 64
@@ -174,32 +174,35 @@ define void @rint_v8f64_v8f64() {
 }
 
 define void @lrint_v8f32_v8i32() {
-; CHECK-LABEL: @lrint_v8f32_v8i32(
-; CHECK-NEXT:    [[A0:%.*]] = load float, ptr @f32, align 8
-; CHECK-NEXT:    [[A1:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
-; CHECK-NEXT:    [[A4:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
-; CHECK-NEXT:    [[A5:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
-; CHECK-NEXT:    [[A6:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
-; CHECK-NEXT:    [[A7:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
-; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.lrint.i32.f32(float [[A0]])
-; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.lrint.i32.f32(float [[A1]])
-; CHECK-NEXT:    [[R2:%.*]] = call i32 @llvm.lrint.i32.f32(float [[A2]])
-; CHECK-NEXT:    [[R3:%.*]] = call i32 @llvm.lrint.i32.f32(float [[A3]])
-; CHECK-NEXT:    [[R4:%.*]] = call i32 @llvm.lrint.i32.f32(float [[A4]])
-; CHECK-NEXT:    [[R5:%.*]] = call i32 @llvm.lrint.i32.f32(float [[A5]])
-; CHECK-NEXT:    [[R6:%.*]] = call i32 @llvm.lrint.i32.f32(float [[A6]])
-; CHECK-NEXT:    [[R7:%.*]] = call i32 @llvm.lrint.i32.f32(float [[A7]])
-; CHECK-NEXT:    store i32 [[R0]], ptr @r32, align 8
-; CHECK-NEXT:    store i32 [[R1]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 1), align 8
-; CHECK-NEXT:    store i32 [[R2]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 2), align 8
-; CHECK-NEXT:    store i32 [[R3]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 3), align 8
-; CHECK-NEXT:    store i32 [[R4]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
-; CHECK-NEXT:    store i32 [[R5]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 5), align 8
-; CHECK-NEXT:    store i32 [[R6]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 6), align 8
-; CHECK-NEXT:    store i32 [[R7]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 7), align 8
-; CHECK-NEXT:    ret void
+; SCALAR-LABEL: @lrint_v8f32_v8i32(
+; SCALAR-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @f32, align 8
+; SCALAR-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> [[TMP1]])
+; SCALAR-NEXT:    store <4 x i32> [[TMP2]], ptr @r32, align 8
+; SCALAR-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; SCALAR-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> [[TMP3]])
+; SCALAR-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
+; SCALAR-NEXT:    ret void
+;
+; VEC128-LABEL: @lrint_v8f32_v8i32(
+; VEC128-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @f32, align 8
+; VEC128-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> [[TMP1]])
+; VEC128-NEXT:    store <4 x i32> [[TMP2]], ptr @r32, align 8
+; VEC128-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; VEC128-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> [[TMP3]])
+; VEC128-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
+; VEC128-NEXT:    ret void
+;
+; VEC256-LABEL: @lrint_v8f32_v8i32(
+; VEC256-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @f32, align 8
+; VEC256-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> [[TMP1]])
+; VEC256-NEXT:    store <8 x i32> [[TMP2]], ptr @r32, align 8
+; VEC256-NEXT:    ret void
+;
+; VEC512-LABEL: @lrint_v8f32_v8i32(
+; VEC512-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @f32, align 8
+; VEC512-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> [[TMP1]])
+; VEC512-NEXT:    store <8 x i32> [[TMP2]], ptr @r32, align 8
+; VEC512-NEXT:    ret void
 ;
   %a0 = load float, ptr @f32, align 8
   %a1 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
@@ -229,32 +232,35 @@ define void @lrint_v8f32_v8i32() {
 }
 
 define void @lrint_v8f64_v8i32() {
-; CHECK-LABEL: @lrint_v8f64_v8i32(
-; CHECK-NEXT:    [[A0:%.*]] = load double, ptr @f64, align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
-; CHECK-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
-; CHECK-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
-; CHECK-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
-; CHECK-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
-; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.lrint.i32.f64(double [[A0]])
-; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.lrint.i32.f64(double [[A1]])
-; CHECK-NEXT:    [[R2:%.*]] = call i32 @llvm.lrint.i32.f64(double [[A2]])
-; CHECK-NEXT:    [[R3:%.*]] = call i32 @llvm.lrint.i32.f64(double [[A3]])
-; CHECK-NEXT:    [[R4:%.*]] = call i32 @llvm.lrint.i32.f64(double [[A4]])
-; CHECK-NEXT:    [[R5:%.*]] = call i32 @llvm.lrint.i32.f64(double [[A5]])
-; CHECK-NEXT:    [[R6:%.*]] = call i32 @llvm.lrint.i32.f64(double [[A6]])
-; CHECK-NEXT:    [[R7:%.*]] = call i32 @llvm.lrint.i32.f64(double [[A7]])
-; CHECK-NEXT:    store i32 [[R0]], ptr @r32, align 8
-; CHECK-NEXT:    store i32 [[R1]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 1), align 8
-; CHECK-NEXT:    store i32 [[R2]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 2), align 8
-; CHECK-NEXT:    store i32 [[R3]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 3), align 8
-; CHECK-NEXT:    store i32 [[R4]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
-; CHECK-NEXT:    store i32 [[R5]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 5), align 8
-; CHECK-NEXT:    store i32 [[R6]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 6), align 8
-; CHECK-NEXT:    store i32 [[R7]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 7), align 8
-; CHECK-NEXT:    ret void
+; SCALAR-LABEL: @lrint_v8f64_v8i32(
+; SCALAR-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @f64, align 8
+; SCALAR-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> [[TMP1]])
+; SCALAR-NEXT:    store <4 x i32> [[TMP2]], ptr @r32, align 8
+; SCALAR-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; SCALAR-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> [[TMP3]])
+; SCALAR-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
+; SCALAR-NEXT:    ret void
+;
+; VEC128-LABEL: @lrint_v8f64_v8i32(
+; VEC128-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @f64, align 8
+; VEC128-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> [[TMP1]])
+; VEC128-NEXT:    store <4 x i32> [[TMP2]], ptr @r32, align 8
+; VEC128-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC128-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> [[TMP3]])
+; VEC128-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
+; VEC128-NEXT:    ret void
+;
+; VEC256-LABEL: @lrint_v8f64_v8i32(
+; VEC256-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @f64, align 8
+; VEC256-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> [[TMP1]])
+; VEC256-NEXT:    store <8 x i32> [[TMP2]], ptr @r32, align 8
+; VEC256-NEXT:    ret void
+;
+; VEC512-LABEL: @lrint_v8f64_v8i32(
+; VEC512-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @f64, align 8
+; VEC512-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> [[TMP1]])
+; VEC512-NEXT:    store <8 x i32> [[TMP2]], ptr @r32, align 8
+; VEC512-NEXT:    ret void
 ;
   %a0 = load double, ptr @f64, align 8
   %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
@@ -284,32 +290,101 @@ define void @lrint_v8f64_v8i32() {
 }
 
 define void @llrint_v8f32_v8i64() {
-; CHECK-LABEL: @llrint_v8f32_v8i64(
-; CHECK-NEXT:    [[A0:%.*]] = load float, ptr @f32, align 8
-; CHECK-NEXT:    [[A1:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
-; CHECK-NEXT:    [[A4:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
-; CHECK-NEXT:    [[A5:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
-; CHECK-NEXT:    [[A6:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
-; CHECK-NEXT:    [[A7:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
-; CHECK-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A0]])
-; CHECK-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A1]])
-; CHECK-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A2]])
-; CHECK-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A3]])
-; CHECK-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A4]])
-; CHECK-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A5]])
-; CHECK-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A6]])
-; CHECK-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A7]])
-; CHECK-NEXT:    store i64 [[R0]], ptr @r64, align 8
-; CHECK-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
-; CHECK-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
-; CHECK-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
-; CHECK-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
-; CHECK-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
-; CHECK-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
-; CHECK-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
-; CHECK-NEXT:    ret void
+; SCALAR-LABEL: @llrint_v8f32_v8i64(
+; SCALAR-NEXT:    [[A0:%.*]] = load float, ptr @f32, align 8
+; SCALAR-NEXT:    [[A1:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+; SCALAR-NEXT:    [[A2:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+; SCALAR-NEXT:    [[A3:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+; SCALAR-NEXT:    [[A4:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; SCALAR-NEXT:    [[A5:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+; SCALAR-NEXT:    [[A6:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+; SCALAR-NEXT:    [[A7:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+; SCALAR-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A0]])
+; SCALAR-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A1]])
+; SCALAR-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A2]])
+; SCALAR-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A3]])
+; SCALAR-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A4]])
+; SCALAR-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A5]])
+; SCALAR-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A6]])
+; SCALAR-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A7]])
+; SCALAR-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; SCALAR-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; SCALAR-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; SCALAR-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; SCALAR-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; SCALAR-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; SCALAR-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; SCALAR-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; SCALAR-NEXT:    ret void
+;
+; VEC128-LABEL: @llrint_v8f32_v8i64(
+; VEC128-NEXT:    [[A0:%.*]] = load float, ptr @f32, align 8
+; VEC128-NEXT:    [[A1:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+; VEC128-NEXT:    [[A2:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+; VEC128-NEXT:    [[A3:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+; VEC128-NEXT:    [[A4:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; VEC128-NEXT:    [[A5:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+; VEC128-NEXT:    [[A6:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+; VEC128-NEXT:    [[A7:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+; VEC128-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A0]])
+; VEC128-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A1]])
+; VEC128-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A2]])
+; VEC128-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A3]])
+; VEC128-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A4]])
+; VEC128-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A5]])
+; VEC128-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A6]])
+; VEC128-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A7]])
+; VEC128-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; VEC128-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; VEC128-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; VEC128-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; VEC128-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC128-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; VEC128-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; VEC128-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; VEC128-NEXT:    ret void
+;
+; VEC256-AVX2-LABEL: @llrint_v8f32_v8i64(
+; VEC256-AVX2-NEXT:    [[A0:%.*]] = load float, ptr @f32, align 8
+; VEC256-AVX2-NEXT:    [[A1:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+; VEC256-AVX2-NEXT:    [[A2:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+; VEC256-AVX2-NEXT:    [[A3:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+; VEC256-AVX2-NEXT:    [[A4:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; VEC256-AVX2-NEXT:    [[A5:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+; VEC256-AVX2-NEXT:    [[A6:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+; VEC256-AVX2-NEXT:    [[A7:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+; VEC256-AVX2-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A0]])
+; VEC256-AVX2-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A1]])
+; VEC256-AVX2-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A2]])
+; VEC256-AVX2-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A3]])
+; VEC256-AVX2-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A4]])
+; VEC256-AVX2-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A5]])
+; VEC256-AVX2-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A6]])
+; VEC256-AVX2-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A7]])
+; VEC256-AVX2-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; VEC256-AVX2-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; VEC256-AVX2-NEXT:    ret void
+;
+; VEC256-AVX512-LABEL: @llrint_v8f32_v8i64(
+; VEC256-AVX512-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @f32, align 8
+; VEC256-AVX512-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> [[TMP1]])
+; VEC256-AVX512-NEXT:    store <4 x i64> [[TMP2]], ptr @r64, align 8
+; VEC256-AVX512-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; VEC256-AVX512-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> [[TMP3]])
+; VEC256-AVX512-NEXT:    store <4 x i64> [[TMP4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC256-AVX512-NEXT:    ret void
+;
+; VEC512-LABEL: @llrint_v8f32_v8i64(
+; VEC512-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @f32, align 8
+; VEC512-NEXT:    [[TMP2:%.*]] = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> [[TMP1]])
+; VEC512-NEXT:    store <8 x i64> [[TMP2]], ptr @r64, align 8
+; VEC512-NEXT:    ret void
 ;
   %a0 = load float, ptr @f32, align 8
   %a1 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
@@ -339,32 +414,101 @@ define void @llrint_v8f32_v8i64() {
 }
 
 define void @llrint_v8f64_v8i64() {
-; CHECK-LABEL: @llrint_v8f64_v8i64(
-; CHECK-NEXT:    [[A0:%.*]] = load double, ptr @f64, align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
-; CHECK-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
-; CHECK-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
-; CHECK-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
-; CHECK-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
-; CHECK-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A0]])
-; CHECK-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A1]])
-; CHECK-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A2]])
-; CHECK-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A3]])
-; CHECK-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A4]])
-; CHECK-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A5]])
-; CHECK-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A6]])
-; CHECK-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A7]])
-; CHECK-NEXT:    store i64 [[R0]], ptr @r64, align 8
-; CHECK-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
-; CHECK-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
-; CHECK-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
-; CHECK-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
-; CHECK-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
-; CHECK-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
-; CHECK-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
-; CHECK-NEXT:    ret void
+; SCALAR-LABEL: @llrint_v8f64_v8i64(
+; SCALAR-NEXT:    [[A0:%.*]] = load double, ptr @f64, align 8
+; SCALAR-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+; SCALAR-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+; SCALAR-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+; SCALAR-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; SCALAR-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+; SCALAR-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+; SCALAR-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+; SCALAR-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A0]])
+; SCALAR-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A1]])
+; SCALAR-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A2]])
+; SCALAR-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A3]])
+; SCALAR-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A4]])
+; SCALAR-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A5]])
+; SCALAR-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A6]])
+; SCALAR-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A7]])
+; SCALAR-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; SCALAR-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; SCALAR-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; SCALAR-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; SCALAR-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; SCALAR-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; SCALAR-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; SCALAR-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; SCALAR-NEXT:    ret void
+;
+; VEC128-LABEL: @llrint_v8f64_v8i64(
+; VEC128-NEXT:    [[A0:%.*]] = load double, ptr @f64, align 8
+; VEC128-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+; VEC128-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+; VEC128-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+; VEC128-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC128-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+; VEC128-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+; VEC128-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+; VEC128-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A0]])
+; VEC128-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A1]])
+; VEC128-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A2]])
+; VEC128-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A3]])
+; VEC128-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A4]])
+; VEC128-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A5]])
+; VEC128-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A6]])
+; VEC128-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A7]])
+; VEC128-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; VEC128-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; VEC128-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; VEC128-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; VEC128-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC128-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; VEC128-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; VEC128-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; VEC128-NEXT:    ret void
+;
+; VEC256-AVX2-LABEL: @llrint_v8f64_v8i64(
+; VEC256-AVX2-NEXT:    [[A0:%.*]] = load double, ptr @f64, align 8
+; VEC256-AVX2-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+; VEC256-AVX2-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+; VEC256-AVX2-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+; VEC256-AVX2-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC256-AVX2-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+; VEC256-AVX2-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+; VEC256-AVX2-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+; VEC256-AVX2-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A0]])
+; VEC256-AVX2-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A1]])
+; VEC256-AVX2-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A2]])
+; VEC256-AVX2-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A3]])
+; VEC256-AVX2-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A4]])
+; VEC256-AVX2-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A5]])
+; VEC256-AVX2-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A6]])
+; VEC256-AVX2-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A7]])
+; VEC256-AVX2-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; VEC256-AVX2-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; VEC256-AVX2-NEXT:    ret void
+;
+; VEC256-AVX512-LABEL: @llrint_v8f64_v8i64(
+; VEC256-AVX512-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @f64, align 8
+; VEC256-AVX512-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> [[TMP1]])
+; VEC256-AVX512-NEXT:    store <4 x i64> [[TMP2]], ptr @r64, align 8
+; VEC256-AVX512-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC256-AVX512-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> [[TMP3]])
+; VEC256-AVX512-NEXT:    store <4 x i64> [[TMP4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC256-AVX512-NEXT:    ret void
+;
+; VEC512-LABEL: @llrint_v8f64_v8i64(
+; VEC512-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @f64, align 8
+; VEC512-NEXT:    [[TMP2:%.*]] = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> [[TMP1]])
+; VEC512-NEXT:    store <8 x i64> [[TMP2]], ptr @r64, align 8
+; VEC512-NEXT:    ret void
 ;
   %a0 = load double, ptr @f64, align 8
   %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
@@ -392,3 +536,5 @@ define void @llrint_v8f64_v8i64() {
   store i64 %r7, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}