[llvm] 0767e43 - [CostModel][X86] Adjust bitreverse/ctpop/ctlz/cttz AVX2+ costs based on llvm-mca reports

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 15 05:17:32 PDT 2021


Author: Simon Pilgrim
Date: 2021-09-15T13:04:40+01:00
New Revision: 0767e43d87453e76e418cabb2b96fdacd1a4981c

URL: https://github.com/llvm/llvm-project/commit/0767e43d87453e76e418cabb2b96fdacd1a4981c
DIFF: https://github.com/llvm/llvm-project/commit/0767e43d87453e76e418cabb2b96fdacd1a4981c.diff

LOG: [CostModel][X86] Adjust bitreverse/ctpop/ctlz/cttz AVX2+ costs based on llvm-mca reports

Based off the worse case numbers generated by D103695, the AVX2/512 bit reversing/counting costs were higher than necessary (based off instruction counts instead of actual throughput).

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/bitreverse.ll
    llvm/test/Analysis/CostModel/X86/ctlz.ll
    llvm/test/Analysis/CostModel/X86/ctpop.ll
    llvm/test/Analysis/CostModel/X86/cttz.ll
    llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
    llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
    llvm/test/Transforms/SLPVectorizer/X86/cttz.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 97b68752ef6f7..9feba110ce0a7 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2628,10 +2628,10 @@ X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   static const CostTblEntry AVX512BWCostTbl[] = {
     { ISD::ABS,        MVT::v32i16,  1 },
     { ISD::ABS,        MVT::v64i8,   1 },
-    { ISD::BITREVERSE, MVT::v8i64,   5 },
-    { ISD::BITREVERSE, MVT::v16i32,  5 },
-    { ISD::BITREVERSE, MVT::v32i16,  5 },
-    { ISD::BITREVERSE, MVT::v64i8,   5 },
+    { ISD::BITREVERSE, MVT::v8i64,   3 },
+    { ISD::BITREVERSE, MVT::v16i32,  3 },
+    { ISD::BITREVERSE, MVT::v32i16,  3 },
+    { ISD::BITREVERSE, MVT::v64i8,   2 },
     { ISD::BSWAP,      MVT::v8i64,   1 },
     { ISD::BSWAP,      MVT::v16i32,  1 },
     { ISD::BSWAP,      MVT::v32i16,  1 },
@@ -2758,25 +2758,41 @@ X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::ABS,        MVT::v8i32,   1 },
     { ISD::ABS,        MVT::v16i16,  1 },
     { ISD::ABS,        MVT::v32i8,   1 },
-    { ISD::BITREVERSE, MVT::v4i64,   5 },
-    { ISD::BITREVERSE, MVT::v8i32,   5 },
-    { ISD::BITREVERSE, MVT::v16i16,  5 },
-    { ISD::BITREVERSE, MVT::v32i8,   5 },
+    { ISD::BITREVERSE, MVT::v2i64,   3 },
+    { ISD::BITREVERSE, MVT::v4i64,   3 },
+    { ISD::BITREVERSE, MVT::v4i32,   3 },
+    { ISD::BITREVERSE, MVT::v8i32,   3 },
+    { ISD::BITREVERSE, MVT::v8i16,   3 },
+    { ISD::BITREVERSE, MVT::v16i16,  3 },
+    { ISD::BITREVERSE, MVT::v16i8,   3 },
+    { ISD::BITREVERSE, MVT::v32i8,   3 },
     { ISD::BSWAP,      MVT::v4i64,   1 },
     { ISD::BSWAP,      MVT::v8i32,   1 },
     { ISD::BSWAP,      MVT::v16i16,  1 },
-    { ISD::CTLZ,       MVT::v4i64,  23 },
-    { ISD::CTLZ,       MVT::v8i32,  18 },
-    { ISD::CTLZ,       MVT::v16i16, 14 },
-    { ISD::CTLZ,       MVT::v32i8,   9 },
-    { ISD::CTPOP,      MVT::v4i64,   7 },
-    { ISD::CTPOP,      MVT::v8i32,  11 },
-    { ISD::CTPOP,      MVT::v16i16,  9 },
-    { ISD::CTPOP,      MVT::v32i8,   6 },
-    { ISD::CTTZ,       MVT::v4i64,  10 },
-    { ISD::CTTZ,       MVT::v8i32,  14 },
-    { ISD::CTTZ,       MVT::v16i16, 12 },
-    { ISD::CTTZ,       MVT::v32i8,   9 },
+    { ISD::CTLZ,       MVT::v2i64,   7 },
+    { ISD::CTLZ,       MVT::v4i64,   7 },
+    { ISD::CTLZ,       MVT::v4i32,   5 },
+    { ISD::CTLZ,       MVT::v8i32,   5 },
+    { ISD::CTLZ,       MVT::v8i16,   4 },
+    { ISD::CTLZ,       MVT::v16i16,  4 },
+    { ISD::CTLZ,       MVT::v16i8,   3 },
+    { ISD::CTLZ,       MVT::v32i8,   3 },
+    { ISD::CTPOP,      MVT::v2i64,   3 },
+    { ISD::CTPOP,      MVT::v4i64,   3 },
+    { ISD::CTPOP,      MVT::v4i32,   7 },
+    { ISD::CTPOP,      MVT::v8i32,   7 },
+    { ISD::CTPOP,      MVT::v8i16,   3 },
+    { ISD::CTPOP,      MVT::v16i16,  3 },
+    { ISD::CTPOP,      MVT::v16i8,   2 },
+    { ISD::CTPOP,      MVT::v32i8,   2 },
+    { ISD::CTTZ,       MVT::v2i64,   4 },
+    { ISD::CTTZ,       MVT::v4i64,   4 },
+    { ISD::CTTZ,       MVT::v4i32,   7 },
+    { ISD::CTTZ,       MVT::v8i32,   7 },
+    { ISD::CTTZ,       MVT::v8i16,   4 },
+    { ISD::CTTZ,       MVT::v16i16,  4 },
+    { ISD::CTTZ,       MVT::v16i8,   3 },
+    { ISD::CTTZ,       MVT::v32i8,   3 },
     { ISD::SADDSAT,    MVT::v16i16,  1 },
     { ISD::SADDSAT,    MVT::v32i8,   1 },
     { ISD::SMAX,       MVT::v8i32,   1 },

diff  --git a/llvm/test/Analysis/CostModel/X86/bitreverse.ll b/llvm/test/Analysis/CostModel/X86/bitreverse.ll
index 745da07da101d..cb95b36e33af0 100644
--- a/llvm/test/Analysis/CostModel/X86/bitreverse.ll
+++ b/llvm/test/Analysis/CostModel/X86/bitreverse.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=X86,SSE2
 ; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=X86,SSE42
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=X86,AVX,AVX1
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=X86,AVX,AVX2
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=X86,AVX1
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=X86,AVX2
 ; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s -check-prefixes=X86,AVX512,AVX512F
 ; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,avx512bw,avx512dq | FileCheck %s -check-prefixes=X86,AVX512,AVX512BW
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=X64,SSE2
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=X64,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=X64,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=X64,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=X64,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=X64,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s -check-prefixes=X64,AVX512,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=X64,AVX512,AVX512BW
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+xop | FileCheck %s -check-prefixes=XOP
@@ -200,12 +200,16 @@ define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
 ;
-; AVX-LABEL: 'var_bitreverse_v2i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
+; AVX1-LABEL: 'var_bitreverse_v2i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
+;
+; AVX2-LABEL: 'var_bitreverse_v2i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
 ;
 ; AVX512-LABEL: 'var_bitreverse_v2i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v2i64'
@@ -250,11 +254,11 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse
 ;
 ; AVX2-LABEL: 'var_bitreverse_v4i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse
 ;
 ; AVX512-LABEL: 'var_bitreverse_v4i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v4i64'
@@ -299,7 +303,7 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse
 ;
 ; AVX2-LABEL: 'var_bitreverse_v8i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse
 ;
 ; AVX512F-LABEL: 'var_bitreverse_v8i64'
@@ -307,7 +311,7 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse
 ;
 ; AVX512BW-LABEL: 'var_bitreverse_v8i64'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v8i64'
@@ -347,12 +351,16 @@ define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
 ;
-; AVX-LABEL: 'var_bitreverse_v4i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
+; AVX1-LABEL: 'var_bitreverse_v4i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
+;
+; AVX2-LABEL: 'var_bitreverse_v4i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
 ;
 ; AVX512-LABEL: 'var_bitreverse_v4i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v4i32'
@@ -397,11 +405,11 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse
 ;
 ; AVX2-LABEL: 'var_bitreverse_v8i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse
 ;
 ; AVX512-LABEL: 'var_bitreverse_v8i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v8i32'
@@ -446,7 +454,7 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse
 ;
 ; AVX2-LABEL: 'var_bitreverse_v16i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse
 ;
 ; AVX512F-LABEL: 'var_bitreverse_v16i32'
@@ -454,7 +462,7 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse
 ;
 ; AVX512BW-LABEL: 'var_bitreverse_v16i32'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v16i32'
@@ -494,12 +502,16 @@ define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
 ;
-; AVX-LABEL: 'var_bitreverse_v8i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
+; AVX1-LABEL: 'var_bitreverse_v8i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
+;
+; AVX2-LABEL: 'var_bitreverse_v8i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
 ;
 ; AVX512-LABEL: 'var_bitreverse_v8i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v8i16'
@@ -544,11 +556,11 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse
 ;
 ; AVX2-LABEL: 'var_bitreverse_v16i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse
 ;
 ; AVX512-LABEL: 'var_bitreverse_v16i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v16i16'
@@ -593,7 +605,7 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse
 ;
 ; AVX2-LABEL: 'var_bitreverse_v32i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse
 ;
 ; AVX512F-LABEL: 'var_bitreverse_v32i16'
@@ -601,7 +613,7 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse
 ;
 ; AVX512BW-LABEL: 'var_bitreverse_v32i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v32i16'
@@ -641,12 +653,16 @@ define <16 x i8> @var_bitreverse_v16i8(<16 x i8> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
 ;
-; AVX-LABEL: 'var_bitreverse_v16i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
+; AVX1-LABEL: 'var_bitreverse_v16i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
+;
+; AVX2-LABEL: 'var_bitreverse_v16i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
 ;
 ; AVX512-LABEL: 'var_bitreverse_v16i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v16i8'
@@ -691,11 +707,11 @@ define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse
 ;
 ; AVX2-LABEL: 'var_bitreverse_v32i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse
 ;
 ; AVX512-LABEL: 'var_bitreverse_v32i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v32i8'
@@ -740,7 +756,7 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse
 ;
 ; AVX2-LABEL: 'var_bitreverse_v64i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse
 ;
 ; AVX512F-LABEL: 'var_bitreverse_v64i8'
@@ -748,7 +764,7 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse
 ;
 ; AVX512BW-LABEL: 'var_bitreverse_v64i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse
 ;
 ; XOP-LABEL: 'var_bitreverse_v64i8'

diff  --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll
index a897e3783eb53..29772397ab87b 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll
@@ -145,18 +145,25 @@ define <2 x i64> @var_ctlz_v2i64(<2 x i64> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
-; AVX-LABEL: 'var_ctlz_v2i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
+; AVX1-LABEL: 'var_ctlz_v2i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
+;
+; AVX2-LABEL: 'var_ctlz_v2i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v2i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v2i64'
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
+; AVX-LABEL: 'var_ctlz_v2i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
   %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 0)
   ret <2 x i64> %ctlz
 }
@@ -170,18 +177,25 @@ define <2 x i64> @var_ctlz_v2i64u(<2 x i64> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
-; AVX-LABEL: 'var_ctlz_v2i64u'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
+; AVX1-LABEL: 'var_ctlz_v2i64u'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
+;
+; AVX2-LABEL: 'var_ctlz_v2i64u'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v2i64u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v2i64u'
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
+; AVX-LABEL: 'var_ctlz_v2i64u'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
   %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 1)
   ret <2 x i64> %ctlz
 }
@@ -200,11 +214,11 @@ define <4 x i64> @var_ctlz_v4i64(<4 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v4i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v4i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v4i64'
@@ -229,11 +243,11 @@ define <4 x i64> @var_ctlz_v4i64u(<4 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v4i64u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v4i64u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v4i64u'
@@ -258,7 +272,7 @@ define <8 x i64> @var_ctlz_v8i64(<8 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v8i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v8i64'
@@ -291,7 +305,7 @@ define <8 x i64> @var_ctlz_v8i64u(<8 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v8i64u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v8i64u'
@@ -319,18 +333,25 @@ define <4 x i32> @var_ctlz_v4i32(<4 x i32> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
-; AVX-LABEL: 'var_ctlz_v4i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
+; AVX1-LABEL: 'var_ctlz_v4i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
+;
+; AVX2-LABEL: 'var_ctlz_v4i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v4i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v4i32'
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
+; AVX-LABEL: 'var_ctlz_v4i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 0)
   ret <4 x i32> %ctlz
 }
@@ -344,18 +365,25 @@ define <4 x i32> @var_ctlz_v4i32u(<4 x i32> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
-; AVX-LABEL: 'var_ctlz_v4i32u'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
+; AVX1-LABEL: 'var_ctlz_v4i32u'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
+;
+; AVX2-LABEL: 'var_ctlz_v4i32u'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v4i32u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v4i32u'
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
+; AVX-LABEL: 'var_ctlz_v4i32u'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 1)
   ret <4 x i32> %ctlz
 }
@@ -374,11 +402,11 @@ define <8 x i32> @var_ctlz_v8i32(<8 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v8i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v8i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v8i32'
@@ -403,11 +431,11 @@ define <8 x i32> @var_ctlz_v8i32u(<8 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v8i32u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v8i32u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v8i32u'
@@ -432,7 +460,7 @@ define <16 x i32> @var_ctlz_v16i32(<16 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v16i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v16i32'
@@ -465,7 +493,7 @@ define <16 x i32> @var_ctlz_v16i32u(<16 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v16i32u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v16i32u'
@@ -493,18 +521,25 @@ define <8 x i16> @var_ctlz_v8i16(<8 x i16> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
-; AVX-LABEL: 'var_ctlz_v8i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
+; AVX1-LABEL: 'var_ctlz_v8i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
+;
+; AVX2-LABEL: 'var_ctlz_v8i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v8i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v8i16'
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
+; AVX-LABEL: 'var_ctlz_v8i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
   %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 0)
   ret <8 x i16> %ctlz
 }
@@ -518,18 +553,25 @@ define <8 x i16> @var_ctlz_v8i16u(<8 x i16> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
-; AVX-LABEL: 'var_ctlz_v8i16u'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
+; AVX1-LABEL: 'var_ctlz_v8i16u'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
+;
+; AVX2-LABEL: 'var_ctlz_v8i16u'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v8i16u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v8i16u'
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
+; AVX-LABEL: 'var_ctlz_v8i16u'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
   %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 1)
   ret <8 x i16> %ctlz
 }
@@ -548,11 +590,11 @@ define <16 x i16> @var_ctlz_v16i16(<16 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v16i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v16i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v16i16'
@@ -577,11 +619,11 @@ define <16 x i16> @var_ctlz_v16i16u(<16 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v16i16u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v16i16u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v16i16u'
@@ -606,7 +648,7 @@ define <32 x i16> @var_ctlz_v32i16(<32 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v32i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v32i16'
@@ -639,7 +681,7 @@ define <32 x i16> @var_ctlz_v32i16u(<32 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v32i16u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v32i16u'
@@ -667,18 +709,25 @@ define <16 x i8> @var_ctlz_v16i8(<16 x i8> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
-; AVX-LABEL: 'var_ctlz_v16i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
+; AVX1-LABEL: 'var_ctlz_v16i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
+;
+; AVX2-LABEL: 'var_ctlz_v16i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v16i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v16i8'
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
+; AVX-LABEL: 'var_ctlz_v16i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
   %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 0)
   ret <16 x i8> %ctlz
 }
@@ -692,18 +741,25 @@ define <16 x i8> @var_ctlz_v16i8u(<16 x i8> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
-; AVX-LABEL: 'var_ctlz_v16i8u'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
+; AVX1-LABEL: 'var_ctlz_v16i8u'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
+;
+; AVX2-LABEL: 'var_ctlz_v16i8u'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v16i8u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v16i8u'
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
+; AVX-LABEL: 'var_ctlz_v16i8u'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
   %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 1)
   ret <16 x i8> %ctlz
 }
@@ -722,11 +778,11 @@ define <32 x i8> @var_ctlz_v32i8(<32 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v32i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v32i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v32i8'
@@ -751,11 +807,11 @@ define <32 x i8> @var_ctlz_v32i8u(<32 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v32i8u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v32i8u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v32i8u'
@@ -780,7 +836,7 @@ define <64 x i8> @var_ctlz_v64i8(<64 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v64i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v64i8'
@@ -813,7 +869,7 @@ define <64 x i8> @var_ctlz_v64i8u(<64 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v64i8u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v64i8u'

diff  --git a/llvm/test/Analysis/CostModel/X86/ctpop.ll b/llvm/test/Analysis/CostModel/X86/ctpop.ll
index a7837d93b837d..0227b03eee86d 100644
--- a/llvm/test/Analysis/CostModel/X86/ctpop.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctpop.ll
@@ -2,8 +2,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=-popcnt,+sse2 | FileCheck %s -check-prefixes=SSE2,NOPOPCNT
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+popcnt,+sse2 | FileCheck %s -check-prefixes=SSE2,POPCNT
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+popcnt,+sse4.2 | FileCheck %s -check-prefixes=POPCNT,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+popcnt,+avx | FileCheck %s -check-prefixes=POPCNT,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+popcnt,+avx2 | FileCheck %s -check-prefixes=POPCNT,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+popcnt,+avx | FileCheck %s -check-prefixes=POPCNT,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+popcnt,+avx2 | FileCheck %s -check-prefixes=POPCNT,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+popcnt,+avx512f | FileCheck %s -check-prefixes=POPCNT,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+popcnt,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=POPCNT,AVX512BW
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+popcnt,+avx512vl,+avx512vpopcntdq | FileCheck %s -check-prefixes=POPCNT,AVX512VPOPCNT
@@ -94,16 +94,20 @@ define <2 x i64> @var_ctpop_v2i64(<2 x i64> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctpop
 ;
-; AVX-LABEL: 'var_ctpop_v2i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctpop
+; AVX1-LABEL: 'var_ctpop_v2i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctpop
+;
+; AVX2-LABEL: 'var_ctpop_v2i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v2i64'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctpop
 ;
 ; AVX512BW-LABEL: 'var_ctpop_v2i64'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctpop
 ;
 ; AVX512VPOPCNT-LABEL: 'var_ctpop_v2i64'
@@ -111,7 +115,7 @@ define <2 x i64> @var_ctpop_v2i64(<2 x i64> %a) {
 ; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctpop
 ;
 ; AVX512BITALG-LABEL: 'var_ctpop_v2i64'
-; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
 ; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctpop
 ;
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -132,15 +136,15 @@ define <4 x i64> @var_ctpop_v4i64(<4 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctpop
 ;
 ; AVX2-LABEL: 'var_ctpop_v4i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v4i64'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctpop
 ;
 ; AVX512BW-LABEL: 'var_ctpop_v4i64'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctpop
 ;
 ; AVX512VPOPCNT-LABEL: 'var_ctpop_v4i64'
@@ -148,7 +152,7 @@ define <4 x i64> @var_ctpop_v4i64(<4 x i64> %a) {
 ; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctpop
 ;
 ; AVX512BITALG-LABEL: 'var_ctpop_v4i64'
-; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
 ; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctpop
 ;
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
@@ -169,7 +173,7 @@ define <8 x i64> @var_ctpop_v8i64(<8 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctpop
 ;
 ; AVX2-LABEL: 'var_ctpop_v8i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctpop = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v8i64'
@@ -201,16 +205,20 @@ define <4 x i32> @var_ctpop_v4i32(<4 x i32> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctpop
 ;
-; AVX-LABEL: 'var_ctpop_v4i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctpop
+; AVX1-LABEL: 'var_ctpop_v4i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctpop
+;
+; AVX2-LABEL: 'var_ctpop_v4i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v4i32'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctpop
 ;
 ; AVX512BW-LABEL: 'var_ctpop_v4i32'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctpop
 ;
 ; AVX512VPOPCNT-LABEL: 'var_ctpop_v4i32'
@@ -218,7 +226,7 @@ define <4 x i32> @var_ctpop_v4i32(<4 x i32> %a) {
 ; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctpop
 ;
 ; AVX512BITALG-LABEL: 'var_ctpop_v4i32'
-; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
 ; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctpop
 ;
   %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
@@ -239,15 +247,15 @@ define <8 x i32> @var_ctpop_v8i32(<8 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctpop
 ;
 ; AVX2-LABEL: 'var_ctpop_v8i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v8i32'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctpop
 ;
 ; AVX512BW-LABEL: 'var_ctpop_v8i32'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctpop
 ;
 ; AVX512VPOPCNT-LABEL: 'var_ctpop_v8i32'
@@ -255,7 +263,7 @@ define <8 x i32> @var_ctpop_v8i32(<8 x i32> %a) {
 ; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctpop
 ;
 ; AVX512BITALG-LABEL: 'var_ctpop_v8i32'
-; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
 ; AVX512BITALG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctpop
 ;
   %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
@@ -276,7 +284,7 @@ define <16 x i32> @var_ctpop_v16i32(<16 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctpop
 ;
 ; AVX2-LABEL: 'var_ctpop_v16i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v16i32'
@@ -308,20 +316,24 @@ define <8 x i16> @var_ctpop_v8i16(<8 x i16> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctpop
 ;
-; AVX-LABEL: 'var_ctpop_v8i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctpop
+; AVX1-LABEL: 'var_ctpop_v8i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctpop
+;
+; AVX2-LABEL: 'var_ctpop_v8i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v8i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctpop
 ;
 ; AVX512BW-LABEL: 'var_ctpop_v8i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctpop
 ;
 ; AVX512VPOPCNT-LABEL: 'var_ctpop_v8i16'
-; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
+; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
 ; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctpop
 ;
 ; AVX512BITALG-LABEL: 'var_ctpop_v8i16'
@@ -346,19 +358,19 @@ define <16 x i16> @var_ctpop_v16i16(<16 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctpop
 ;
 ; AVX2-LABEL: 'var_ctpop_v16i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v16i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctpop
 ;
 ; AVX512BW-LABEL: 'var_ctpop_v16i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctpop
 ;
 ; AVX512VPOPCNT-LABEL: 'var_ctpop_v16i16'
-; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
+; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
 ; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctpop
 ;
 ; AVX512BITALG-LABEL: 'var_ctpop_v16i16'
@@ -383,7 +395,7 @@ define <32 x i16> @var_ctpop_v32i16(<32 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctpop
 ;
 ; AVX2-LABEL: 'var_ctpop_v32i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctpop = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v32i16'
@@ -415,20 +427,24 @@ define <16 x i8> @var_ctpop_v16i8(<16 x i8> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctpop
 ;
-; AVX-LABEL: 'var_ctpop_v16i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctpop
+; AVX1-LABEL: 'var_ctpop_v16i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctpop
+;
+; AVX2-LABEL: 'var_ctpop_v16i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v16i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctpop
 ;
 ; AVX512BW-LABEL: 'var_ctpop_v16i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctpop
 ;
 ; AVX512VPOPCNT-LABEL: 'var_ctpop_v16i8'
-; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
+; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
 ; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctpop
 ;
 ; AVX512BITALG-LABEL: 'var_ctpop_v16i8'
@@ -453,19 +469,19 @@ define <32 x i8> @var_ctpop_v32i8(<32 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctpop
 ;
 ; AVX2-LABEL: 'var_ctpop_v32i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v32i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctpop
 ;
 ; AVX512BW-LABEL: 'var_ctpop_v32i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctpop
 ;
 ; AVX512VPOPCNT-LABEL: 'var_ctpop_v32i8'
-; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
+; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
 ; AVX512VPOPCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctpop
 ;
 ; AVX512BITALG-LABEL: 'var_ctpop_v32i8'
@@ -490,7 +506,7 @@ define <64 x i8> @var_ctpop_v64i8(<64 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctpop
 ;
 ; AVX2-LABEL: 'var_ctpop_v64i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %ctpop = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %a)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %a)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctpop
 ;
 ; AVX512F-LABEL: 'var_ctpop_v64i8'

diff  --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll
index c908579f7f018..b72498531158f 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz.ll
@@ -2,8 +2,8 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=-bmi,+sse2 | FileCheck %s -check-prefixes=SSE2,NOBMI
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+bmi,+sse2 | FileCheck %s -check-prefixes=SSE2,BMI
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+bmi,+sse4.2 | FileCheck %s -check-prefixes=BMI,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+bmi,+avx | FileCheck %s -check-prefixes=BMI,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+bmi,+avx2 | FileCheck %s -check-prefixes=BMI,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+bmi,+avx | FileCheck %s -check-prefixes=BMI,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+bmi,+avx2 | FileCheck %s -check-prefixes=BMI,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+bmi,+avx512f | FileCheck %s -check-prefixes=BMI,AVX512,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=BMI,AVX512,AVX512BW
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=BMI,AVX512,AVX512BW
@@ -147,12 +147,16 @@ define <2 x i64> @var_cttz_v2i64(<2 x i64> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
 ;
-; AVX-LABEL: 'var_cttz_v2i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
+; AVX1-LABEL: 'var_cttz_v2i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
+;
+; AVX2-LABEL: 'var_cttz_v2i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v2i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
 ;
   %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 0)
@@ -168,12 +172,16 @@ define <2 x i64> @var_cttz_v2i64u(<2 x i64> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
 ;
-; AVX-LABEL: 'var_cttz_v2i64u'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
+; AVX1-LABEL: 'var_cttz_v2i64u'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
+;
+; AVX2-LABEL: 'var_cttz_v2i64u'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v2i64u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
 ;
   %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 1)
@@ -194,11 +202,11 @@ define <4 x i64> @var_cttz_v4i64(<4 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v4i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v4i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %cttz
 ;
   %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 0)
@@ -219,11 +227,11 @@ define <4 x i64> @var_cttz_v4i64u(<4 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v4i64u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v4i64u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %cttz
 ;
   %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 1)
@@ -244,7 +252,7 @@ define <8 x i64> @var_cttz_v8i64(<8 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v8i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %cttz
 ;
 ; AVX512F-LABEL: 'var_cttz_v8i64'
@@ -281,7 +289,7 @@ define <8 x i64> @var_cttz_v8i64u(<8 x i64> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v8i64u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %cttz
 ;
 ; AVX512F-LABEL: 'var_cttz_v8i64u'
@@ -313,12 +321,16 @@ define <4 x i32> @var_cttz_v4i32(<4 x i32> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
 ;
-; AVX-LABEL: 'var_cttz_v4i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
+; AVX1-LABEL: 'var_cttz_v4i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
+;
+; AVX2-LABEL: 'var_cttz_v4i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v4i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
 ;
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 0)
@@ -334,12 +346,16 @@ define <4 x i32> @var_cttz_v4i32u(<4 x i32> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
 ;
-; AVX-LABEL: 'var_cttz_v4i32u'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
+; AVX1-LABEL: 'var_cttz_v4i32u'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
+;
+; AVX2-LABEL: 'var_cttz_v4i32u'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v4i32u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
 ;
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 1)
@@ -360,11 +376,11 @@ define <8 x i32> @var_cttz_v8i32(<8 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v8i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v8i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %cttz
 ;
   %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 0)
@@ -385,11 +401,11 @@ define <8 x i32> @var_cttz_v8i32u(<8 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v8i32u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v8i32u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %cttz
 ;
   %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 1)
@@ -410,7 +426,7 @@ define <16 x i32> @var_cttz_v16i32(<16 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v16i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %cttz = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %cttz
 ;
 ; AVX512F-LABEL: 'var_cttz_v16i32'
@@ -447,7 +463,7 @@ define <16 x i32> @var_cttz_v16i32u(<16 x i32> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v16i32u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %cttz = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cttz = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %cttz
 ;
 ; AVX512F-LABEL: 'var_cttz_v16i32u'
@@ -479,12 +495,16 @@ define <8 x i16> @var_cttz_v8i16(<8 x i16> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
 ;
-; AVX-LABEL: 'var_cttz_v8i16'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
+; AVX1-LABEL: 'var_cttz_v8i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
+;
+; AVX2-LABEL: 'var_cttz_v8i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v8i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
 ;
   %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 0)
@@ -500,12 +520,16 @@ define <8 x i16> @var_cttz_v8i16u(<8 x i16> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
 ;
-; AVX-LABEL: 'var_cttz_v8i16u'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
+; AVX1-LABEL: 'var_cttz_v8i16u'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
+;
+; AVX2-LABEL: 'var_cttz_v8i16u'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v8i16u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
 ;
   %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 1)
@@ -526,11 +550,11 @@ define <16 x i16> @var_cttz_v16i16(<16 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v16i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v16i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %cttz
 ;
   %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 0)
@@ -551,11 +575,11 @@ define <16 x i16> @var_cttz_v16i16u(<16 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v16i16u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v16i16u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %cttz
 ;
   %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 1)
@@ -576,7 +600,7 @@ define <32 x i16> @var_cttz_v32i16(<32 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v32i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %cttz = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %cttz
 ;
 ; AVX512F-LABEL: 'var_cttz_v32i16'
@@ -613,7 +637,7 @@ define <32 x i16> @var_cttz_v32i16u(<32 x i16> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v32i16u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %cttz = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %cttz
 ;
 ; AVX512F-LABEL: 'var_cttz_v32i16u'
@@ -645,12 +669,16 @@ define <16 x i8> @var_cttz_v16i8(<16 x i8> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
 ;
-; AVX-LABEL: 'var_cttz_v16i8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
+; AVX1-LABEL: 'var_cttz_v16i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
+;
+; AVX2-LABEL: 'var_cttz_v16i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v16i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
 ;
   %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 0)
@@ -666,12 +694,16 @@ define <16 x i8> @var_cttz_v16i8u(<16 x i8> %a) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
 ;
-; AVX-LABEL: 'var_cttz_v16i8u'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
+; AVX1-LABEL: 'var_cttz_v16i8u'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
+;
+; AVX2-LABEL: 'var_cttz_v16i8u'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v16i8u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
 ;
   %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 1)
@@ -692,11 +724,11 @@ define <32 x i8> @var_cttz_v32i8(<32 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v32i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v32i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %cttz
 ;
   %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 0)
@@ -717,11 +749,11 @@ define <32 x i8> @var_cttz_v32i8u(<32 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v32i8u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %cttz
 ;
 ; AVX512-LABEL: 'var_cttz_v32i8u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %cttz
 ;
   %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 1)
@@ -742,7 +774,7 @@ define <64 x i8> @var_cttz_v64i8(<64 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v64i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %cttz = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cttz = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %cttz
 ;
 ; AVX512F-LABEL: 'var_cttz_v64i8'
@@ -779,7 +811,7 @@ define <64 x i8> @var_cttz_v64i8u(<64 x i8> %a) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %cttz
 ;
 ; AVX2-LABEL: 'var_cttz_v64i8u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %cttz = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cttz = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %cttz
 ;
 ; AVX512F-LABEL: 'var_cttz_v64i8u'

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
index fef682d4b9bda..490263a396986 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
@@ -44,20 +44,41 @@ define void @ctlz_2i64() #0 {
 }
 
 define void @ctlz_4i64() #0 {
-; CHECK-LABEL: @ctlz_4i64(
-; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
-; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
-; CHECK-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
-; CHECK-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
-; CHECK-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false)
-; CHECK-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false)
-; CHECK-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
-; CHECK-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
-; CHECK-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
-; CHECK-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @ctlz_4i64(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; SSE-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
+; SSE-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
+; SSE-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false)
+; SSE-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false)
+; SSE-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; SSE-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; SSE-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; SSE-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; SSE-NEXT:    ret void
+;
+; AVX1-LABEL: @ctlz_4i64(
+; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; AVX1-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
+; AVX1-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
+; AVX1-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false)
+; AVX1-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false)
+; AVX1-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; AVX1-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; AVX1-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; AVX1-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @ctlz_4i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> [[TMP1]], i1 false)
+; AVX2-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
+; AVX2-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
   %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
@@ -96,26 +117,11 @@ define void @ctlz_4i32() #0 {
 ; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
 ; SSE42-NEXT:    ret void
 ;
-; AVX1-LABEL: @ctlz_4i32(
-; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; AVX1-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; AVX1-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; AVX1-NEXT:    ret void
-;
-; AVX2-LABEL: @ctlz_4i32(
-; AVX2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; AVX2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; AVX2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; AVX2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; AVX2-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; AVX2-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; AVX2-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; AVX2-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; AVX2-NEXT:    store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX2-NEXT:    store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX2-NEXT:    store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX2-NEXT:    store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX2-NEXT:    ret void
+; AVX-LABEL: @ctlz_4i32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; AVX-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
@@ -500,20 +506,41 @@ define void @ctlz_undef_2i64() #0 {
 }
 
 define void @ctlz_undef_4i64() #0 {
-; CHECK-LABEL: @ctlz_undef_4i64(
-; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
-; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
-; CHECK-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
-; CHECK-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
-; CHECK-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true)
-; CHECK-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true)
-; CHECK-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
-; CHECK-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
-; CHECK-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
-; CHECK-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @ctlz_undef_4i64(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; SSE-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
+; SSE-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
+; SSE-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true)
+; SSE-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true)
+; SSE-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; SSE-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; SSE-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; SSE-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; SSE-NEXT:    ret void
+;
+; AVX1-LABEL: @ctlz_undef_4i64(
+; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; AVX1-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
+; AVX1-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
+; AVX1-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true)
+; AVX1-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true)
+; AVX1-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; AVX1-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; AVX1-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; AVX1-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @ctlz_undef_4i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> [[TMP1]], i1 true)
+; AVX2-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
+; AVX2-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
   %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
@@ -552,26 +579,11 @@ define void @ctlz_undef_4i32() #0 {
 ; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
 ; SSE42-NEXT:    ret void
 ;
-; AVX1-LABEL: @ctlz_undef_4i32(
-; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; AVX1-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
-; AVX1-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; AVX1-NEXT:    ret void
-;
-; AVX2-LABEL: @ctlz_undef_4i32(
-; AVX2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; AVX2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; AVX2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; AVX2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; AVX2-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
-; AVX2-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
-; AVX2-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
-; AVX2-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
-; AVX2-NEXT:    store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX2-NEXT:    store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX2-NEXT:    store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX2-NEXT:    store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX2-NEXT:    ret void
+; AVX-LABEL: @ctlz_undef_4i32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; AVX-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; AVX-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
index 8dfcfbf285da1..973512922ea03 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
@@ -36,14 +36,20 @@ define void @ctpop_2i64() #0 {
 ; SSE42-NEXT:    store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
 ; SSE42-NEXT:    ret void
 ;
-; AVX-LABEL: @ctpop_2i64(
-; AVX-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
-; AVX-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
-; AVX-NEXT:    [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
-; AVX-NEXT:    [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
-; AVX-NEXT:    store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX-NEXT:    store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX-NEXT:    ret void
+; AVX1-LABEL: @ctpop_2i64(
+; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+; AVX1-NEXT:    [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
+; AVX1-NEXT:    [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
+; AVX1-NEXT:    store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX1-NEXT:    store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @ctpop_2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
+; AVX2-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]])
+; AVX2-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
+; AVX2-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
   %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
@@ -137,20 +143,26 @@ define void @ctpop_4i32() #0 {
 ; SSE42-NEXT:    store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
 ; SSE42-NEXT:    ret void
 ;
-; AVX-LABEL: @ctpop_4i32(
-; AVX-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; AVX-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; AVX-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; AVX-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; AVX-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
-; AVX-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
-; AVX-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
-; AVX-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
-; AVX-NEXT:    store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX-NEXT:    store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX-NEXT:    store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX-NEXT:    store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX-NEXT:    ret void
+; AVX1-LABEL: @ctpop_4i32(
+; AVX1-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; AVX1-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; AVX1-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; AVX1-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; AVX1-NEXT:    [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
+; AVX1-NEXT:    [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
+; AVX1-NEXT:    [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
+; AVX1-NEXT:    [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
+; AVX1-NEXT:    store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; AVX1-NEXT:    store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; AVX1-NEXT:    store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; AVX1-NEXT:    store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @ctpop_4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])
+; AVX2-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; AVX2-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
index eb37546f07a05..7cbc4b1b88530 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
@@ -44,20 +44,41 @@ define void @cttz_2i64() #0 {
 }
 
 define void @cttz_4i64() #0 {
-; CHECK-LABEL: @cttz_4i64(
-; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
-; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
-; CHECK-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false)
-; CHECK-NEXT:    [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false)
-; CHECK-NEXT:    [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 false)
-; CHECK-NEXT:    [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 false)
-; CHECK-NEXT:    store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
-; CHECK-NEXT:    store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
-; CHECK-NEXT:    store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
-; CHECK-NEXT:    store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @cttz_4i64(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; SSE-NEXT:    [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false)
+; SSE-NEXT:    [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false)
+; SSE-NEXT:    [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 false)
+; SSE-NEXT:    [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 false)
+; SSE-NEXT:    store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; SSE-NEXT:    store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; SSE-NEXT:    store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; SSE-NEXT:    store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; SSE-NEXT:    ret void
+;
+; AVX1-LABEL: @cttz_4i64(
+; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; AVX1-NEXT:    [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false)
+; AVX1-NEXT:    [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false)
+; AVX1-NEXT:    [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 false)
+; AVX1-NEXT:    [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 false)
+; AVX1-NEXT:    store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; AVX1-NEXT:    store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; AVX1-NEXT:    store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; AVX1-NEXT:    store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @cttz_4i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> [[TMP1]], i1 false)
+; AVX2-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
+; AVX2-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
   %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
@@ -96,26 +117,11 @@ define void @cttz_4i32() #0 {
 ; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
 ; SSE42-NEXT:    ret void
 ;
-; AVX1-LABEL: @cttz_4i32(
-; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; AVX1-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; AVX1-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; AVX1-NEXT:    ret void
-;
-; AVX2-LABEL: @cttz_4i32(
-; AVX2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; AVX2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; AVX2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; AVX2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; AVX2-NEXT:    [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false)
-; AVX2-NEXT:    [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false)
-; AVX2-NEXT:    [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false)
-; AVX2-NEXT:    [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false)
-; AVX2-NEXT:    store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX2-NEXT:    store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX2-NEXT:    store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX2-NEXT:    store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX2-NEXT:    ret void
+; AVX-LABEL: @cttz_4i32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; AVX-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
@@ -500,20 +506,41 @@ define void @cttz_undef_2i64() #0 {
 }
 
 define void @cttz_undef_4i64() #0 {
-; CHECK-LABEL: @cttz_undef_4i64(
-; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
-; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
-; CHECK-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true)
-; CHECK-NEXT:    [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true)
-; CHECK-NEXT:    [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 true)
-; CHECK-NEXT:    [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 true)
-; CHECK-NEXT:    store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
-; CHECK-NEXT:    store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
-; CHECK-NEXT:    store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
-; CHECK-NEXT:    store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @cttz_undef_4i64(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; SSE-NEXT:    [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true)
+; SSE-NEXT:    [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true)
+; SSE-NEXT:    [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 true)
+; SSE-NEXT:    [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 true)
+; SSE-NEXT:    store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; SSE-NEXT:    store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; SSE-NEXT:    store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; SSE-NEXT:    store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; SSE-NEXT:    ret void
+;
+; AVX1-LABEL: @cttz_undef_4i64(
+; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; AVX1-NEXT:    [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true)
+; AVX1-NEXT:    [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true)
+; AVX1-NEXT:    [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 true)
+; AVX1-NEXT:    [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 true)
+; AVX1-NEXT:    store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; AVX1-NEXT:    store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; AVX1-NEXT:    store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; AVX1-NEXT:    store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @cttz_undef_4i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> [[TMP1]], i1 true)
+; AVX2-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
+; AVX2-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
   %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
@@ -552,26 +579,11 @@ define void @cttz_undef_4i32() #0 {
 ; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
 ; SSE42-NEXT:    ret void
 ;
-; AVX1-LABEL: @cttz_undef_4i32(
-; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; AVX1-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
-; AVX1-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; AVX1-NEXT:    ret void
-;
-; AVX2-LABEL: @cttz_undef_4i32(
-; AVX2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; AVX2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; AVX2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; AVX2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; AVX2-NEXT:    [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true)
-; AVX2-NEXT:    [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true)
-; AVX2-NEXT:    [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true)
-; AVX2-NEXT:    [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true)
-; AVX2-NEXT:    store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX2-NEXT:    store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX2-NEXT:    store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX2-NEXT:    store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX2-NEXT:    ret void
+; AVX-LABEL: @cttz_undef_4i32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; AVX-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; AVX-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4


        


More information about the llvm-commits mailing list