[llvm] 23cb1c4 - [CostModel][X86] Update throughput costs for CTLZ ops

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 16 08:57:05 PDT 2022


Author: Simon Pilgrim
Date: 2022-09-16T16:56:49+01:00
New Revision: 23cb1c42cd2008169193d7d531b5861d8acbe819

URL: https://github.com/llvm/llvm-project/commit/23cb1c42cd2008169193d7d531b5861d8acbe819
DIFF: https://github.com/llvm/llvm-project/commit/23cb1c42cd2008169193d7d531b5861d8acbe819.diff

LOG: [CostModel][X86] Update throughput costs for CTLZ ops

This was achieved with an updated version of the 'cost-tables vs llvm-mca' script D103695 (and recent fixes to the bdver2 + alderlake models)

Adding full CostKinds costs are affecting some other tests as they make assumptions about SizeLatency costs, so they need addressing first

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/ctlz.ll
    llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
    llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 133c933b633a8..61f51bd6501d8 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3332,16 +3332,16 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   static const CostKindTblEntry AVX512CDCostTbl[] = {
     { ISD::CTLZ,       MVT::v8i64,   {  1 } },
     { ISD::CTLZ,       MVT::v16i32,  {  1 } },
-    { ISD::CTLZ,       MVT::v32i16,  {  8 } },
-    { ISD::CTLZ,       MVT::v64i8,   { 20 } },
+    { ISD::CTLZ,       MVT::v32i16,  { 18 } },
+    { ISD::CTLZ,       MVT::v64i8,   {  3 } },
     { ISD::CTLZ,       MVT::v4i64,   {  1 } },
     { ISD::CTLZ,       MVT::v8i32,   {  1 } },
-    { ISD::CTLZ,       MVT::v16i16,  {  4 } },
-    { ISD::CTLZ,       MVT::v32i8,   { 10 } },
+    { ISD::CTLZ,       MVT::v16i16,  {  8 } },
+    { ISD::CTLZ,       MVT::v32i8,   {  2 } },
     { ISD::CTLZ,       MVT::v2i64,   {  1 } },
     { ISD::CTLZ,       MVT::v4i32,   {  1 } },
-    { ISD::CTLZ,       MVT::v8i16,   {  4 } },
-    { ISD::CTLZ,       MVT::v16i8,   {  4 } },
+    { ISD::CTLZ,       MVT::v8i16,   {  3 } },
+    { ISD::CTLZ,       MVT::v16i8,   {  2 } },
   };
   static const CostKindTblEntry AVX512BWCostTbl[] = {
     { ISD::ABS,        MVT::v32i16,  {  1,  1,  1,  1 } },
@@ -3353,10 +3353,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::BSWAP,      MVT::v8i64,   {  1 } },
     { ISD::BSWAP,      MVT::v16i32,  {  1 } },
     { ISD::BSWAP,      MVT::v32i16,  {  1 } },
-    { ISD::CTLZ,       MVT::v8i64,   { 23 } },
-    { ISD::CTLZ,       MVT::v16i32,  { 22 } },
-    { ISD::CTLZ,       MVT::v32i16,  { 18 } },
-    { ISD::CTLZ,       MVT::v64i8,   { 17 } },
+    { ISD::CTLZ,       MVT::v8i64,   {  8 } },
+    { ISD::CTLZ,       MVT::v16i32,  {  8 } },
+    { ISD::CTLZ,       MVT::v32i16,  {  4 } },
+    { ISD::CTLZ,       MVT::v64i8,   {  3 } },
     { ISD::CTPOP,      MVT::v2i64,   {  3,  7, 10, 10 } },
     { ISD::CTPOP,      MVT::v4i64,   {  3,  7, 10, 10 } },
     { ISD::CTPOP,      MVT::v8i64,   {  3,  8, 10, 12 } },
@@ -3419,10 +3419,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::BSWAP,      MVT::v8i64,   {  4 } },
     { ISD::BSWAP,      MVT::v16i32,  {  4 } },
     { ISD::BSWAP,      MVT::v32i16,  {  4 } },
-    { ISD::CTLZ,       MVT::v8i64,   { 29 } },
-    { ISD::CTLZ,       MVT::v16i32,  { 35 } },
-    { ISD::CTLZ,       MVT::v32i16,  { 28 } },
-    { ISD::CTLZ,       MVT::v64i8,   { 18 } },
+    { ISD::CTLZ,       MVT::v8i64,   { 10 } },
+    { ISD::CTLZ,       MVT::v16i32,  { 11 } },
+    { ISD::CTLZ,       MVT::v32i16,  {  8 } },
+    { ISD::CTLZ,       MVT::v64i8,   {  5 } },
     { ISD::CTPOP,      MVT::v8i64,   { 16, 16, 19, 19 } },
     { ISD::CTPOP,      MVT::v16i32,  { 24, 19, 27, 27 } },
     { ISD::CTPOP,      MVT::v32i16,  { 18, 15, 22, 22 } },
@@ -3552,13 +3552,13 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::BSWAP,      MVT::v8i32,   {  1 } },
     { ISD::BSWAP,      MVT::v16i16,  {  1 } },
     { ISD::CTLZ,       MVT::v2i64,   {  7 } },
-    { ISD::CTLZ,       MVT::v4i64,   {  7 } },
+    { ISD::CTLZ,       MVT::v4i64,   { 14 } },
     { ISD::CTLZ,       MVT::v4i32,   {  5 } },
-    { ISD::CTLZ,       MVT::v8i32,   {  5 } },
+    { ISD::CTLZ,       MVT::v8i32,   { 10 } },
     { ISD::CTLZ,       MVT::v8i16,   {  4 } },
-    { ISD::CTLZ,       MVT::v16i16,  {  4 } },
+    { ISD::CTLZ,       MVT::v16i16,  {  6 } },
     { ISD::CTLZ,       MVT::v16i8,   {  3 } },
-    { ISD::CTLZ,       MVT::v32i8,   {  3 } },
+    { ISD::CTLZ,       MVT::v32i8,   {  4 } },
     { ISD::CTPOP,      MVT::v2i64,   {  3,  9, 10, 10 } },
     { ISD::CTPOP,      MVT::v4i64,   {  4,  9, 10, 14 } },
     { ISD::CTPOP,      MVT::v4i32,   {  7, 12, 14, 14 } },
@@ -3618,10 +3618,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::BSWAP,      MVT::v4i64,   {  4 } },
     { ISD::BSWAP,      MVT::v8i32,   {  4 } },
     { ISD::BSWAP,      MVT::v16i16,  {  4 } },
-    { ISD::CTLZ,       MVT::v4i64,   { 48 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::CTLZ,       MVT::v8i32,   { 38 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::CTLZ,       MVT::v16i16,  { 30 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::CTLZ,       MVT::v32i8,   { 20 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTLZ,       MVT::v4i64,   { 29 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTLZ,       MVT::v2i64,   { 14 } },
+    { ISD::CTLZ,       MVT::v8i32,   { 24 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTLZ,       MVT::v4i32,   { 12 } },
+    { ISD::CTLZ,       MVT::v16i16,  { 19 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTLZ,       MVT::v8i16,   {  9 } },
+    { ISD::CTLZ,       MVT::v32i8,   { 14 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTLZ,       MVT::v16i8,   {  7 } },
     { ISD::CTPOP,      MVT::v4i64,   { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
     { ISD::CTPOP,      MVT::v2i64,   {  7, 14, 10, 14 } },
     { ISD::CTPOP,      MVT::v8i32,   { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
@@ -3709,10 +3713,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::BSWAP,      MVT::v2i64,   {  1 } },
     { ISD::BSWAP,      MVT::v4i32,   {  1 } },
     { ISD::BSWAP,      MVT::v8i16,   {  1 } },
-    { ISD::CTLZ,       MVT::v2i64,   { 23 } },
-    { ISD::CTLZ,       MVT::v4i32,   { 18 } },
-    { ISD::CTLZ,       MVT::v8i16,   { 14 } },
-    { ISD::CTLZ,       MVT::v16i8,   {  9 } },
+    { ISD::CTLZ,       MVT::v2i64,   { 18 } },
+    { ISD::CTLZ,       MVT::v4i32,   { 15 } },
+    { ISD::CTLZ,       MVT::v8i16,   { 13 } },
+    { ISD::CTLZ,       MVT::v16i8,   { 11 } },
     { ISD::CTPOP,      MVT::v2i64,   { 13, 19, 12, 18 } },
     { ISD::CTPOP,      MVT::v4i32,   { 18, 24, 16, 22 } },
     { ISD::CTPOP,      MVT::v8i16,   { 13, 18, 14, 20 } },
@@ -3734,10 +3738,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::BSWAP,      MVT::v2i64,   {  7 } },
     { ISD::BSWAP,      MVT::v4i32,   {  7 } },
     { ISD::BSWAP,      MVT::v8i16,   {  7 } },
-    { ISD::CTLZ,       MVT::v2i64,   { 25 } },
-    { ISD::CTLZ,       MVT::v4i32,   { 26 } },
-    { ISD::CTLZ,       MVT::v8i16,   { 20 } },
-    { ISD::CTLZ,       MVT::v16i8,   { 17 } },
+    { ISD::CTLZ,       MVT::v2i64,   { 10 } },
+    { ISD::CTLZ,       MVT::v4i32,   { 10 } },
+    { ISD::CTLZ,       MVT::v8i16,   {  9 } },
+    { ISD::CTLZ,       MVT::v16i8,   {  8 } },
     { ISD::CTPOP,      MVT::v2i64,   { 12, 26, 16, 18 } },
     { ISD::CTPOP,      MVT::v4i32,   { 15, 29, 21, 23 } },
     { ISD::CTPOP,      MVT::v8i16,   { 13, 25, 18, 20 } },
@@ -3784,8 +3788,8 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   };
   static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
     { ISD::CTLZ,       MVT::i32,     {  1 } },
-    { ISD::CTLZ,       MVT::i16,     {  1 } },
-    { ISD::CTLZ,       MVT::i8,      {  1 } },
+    { ISD::CTLZ,       MVT::i16,     {  2 } },
+    { ISD::CTLZ,       MVT::i8,      {  2 } },
   };
   static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
     { ISD::CTPOP,      MVT::i64,     {  1, 1, 1, 1 } }, // popcnt
@@ -3801,7 +3805,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::BSWAP,      MVT::i64,     {  1 } },
     { ISD::CTLZ,       MVT::i64,     {  4 } }, // BSR+XOR or BSR+XOR+CMOV
     { ISD::CTTZ,       MVT::i64,     {  3 } }, // TEST+BSF+CMOV/BRANCH
-    { ISD::CTPOP,      MVT::i64,     { 10, 6, 19, 19 } },
+    { ISD::CTPOP,      MVT::i64,     { 10,  6, 19, 19 } },
     { ISD::ROTL,       MVT::i64,     {  1 } },
     { ISD::ROTR,       MVT::i64,     {  1 } },
     { ISD::FSHL,       MVT::i64,     {  4 } },

diff  --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll
index e43249f2163f4..8090415e78c00 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll
@@ -73,7 +73,7 @@ define i16 @var_ctlz_i16(i16 %a) {
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i16'
-; LZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; LZCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
 ; LZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
 ;
   %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 0)
@@ -86,7 +86,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i16u'
-; LZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 true)
+; LZCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 true)
 ; LZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
 ;
   %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 1)
@@ -99,7 +99,7 @@ define i8 @var_ctlz_i8(i8 %a) {
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i8'
-; LZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; LZCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
 ; LZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
 ;
   %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 0)
@@ -112,7 +112,7 @@ define i8 @var_ctlz_i8u(i8 %a) {
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i8u'
-; LZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 true)
+; LZCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 true)
 ; LZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
 ;
   %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 1)
@@ -138,15 +138,15 @@ declare <64 x i8> @llvm.ctlz.v64i8(<64 x i8>, i1)
 
 define <2 x i64> @var_ctlz_v2i64(<2 x i64> %a) {
 ; SSE2-LABEL: 'var_ctlz_v2i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v2i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v2i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v2i64'
@@ -170,15 +170,15 @@ define <2 x i64> @var_ctlz_v2i64(<2 x i64> %a) {
 
 define <2 x i64> @var_ctlz_v2i64u(<2 x i64> %a) {
 ; SSE2-LABEL: 'var_ctlz_v2i64u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v2i64u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v2i64u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v2i64u'
@@ -202,23 +202,23 @@ define <2 x i64> @var_ctlz_v2i64u(<2 x i64> %a) {
 
 define <4 x i64> @var_ctlz_v4i64(<4 x i64> %a) {
 ; SSE2-LABEL: 'var_ctlz_v4i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v4i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v4i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v4i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v4i64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v4i64'
@@ -231,23 +231,23 @@ define <4 x i64> @var_ctlz_v4i64(<4 x i64> %a) {
 
 define <4 x i64> @var_ctlz_v4i64u(<4 x i64> %a) {
 ; SSE2-LABEL: 'var_ctlz_v4i64u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v4i64u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v4i64u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v4i64u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v4i64u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v4i64u'
@@ -260,27 +260,27 @@ define <4 x i64> @var_ctlz_v4i64u(<4 x i64> %a) {
 
 define <8 x i64> @var_ctlz_v8i64(<8 x i64> %a) {
 ; SSE2-LABEL: 'var_ctlz_v8i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v8i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v8i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v8i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v8i64'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX512BW-LABEL: 'var_ctlz_v8i64'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v8i64'
@@ -293,27 +293,27 @@ define <8 x i64> @var_ctlz_v8i64(<8 x i64> %a) {
 
 define <8 x i64> @var_ctlz_v8i64u(<8 x i64> %a) {
 ; SSE2-LABEL: 'var_ctlz_v8i64u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v8i64u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v8i64u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v8i64u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v8i64u'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX512BW-LABEL: 'var_ctlz_v8i64u'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v8i64u'
@@ -326,15 +326,15 @@ define <8 x i64> @var_ctlz_v8i64u(<8 x i64> %a) {
 
 define <4 x i32> @var_ctlz_v4i32(<4 x i32> %a) {
 ; SSE2-LABEL: 'var_ctlz_v4i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v4i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v4i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v4i32'
@@ -358,15 +358,15 @@ define <4 x i32> @var_ctlz_v4i32(<4 x i32> %a) {
 
 define <4 x i32> @var_ctlz_v4i32u(<4 x i32> %a) {
 ; SSE2-LABEL: 'var_ctlz_v4i32u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v4i32u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v4i32u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v4i32u'
@@ -390,23 +390,23 @@ define <4 x i32> @var_ctlz_v4i32u(<4 x i32> %a) {
 
 define <8 x i32> @var_ctlz_v8i32(<8 x i32> %a) {
 ; SSE2-LABEL: 'var_ctlz_v8i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v8i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v8i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v8i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v8i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v8i32'
@@ -419,23 +419,23 @@ define <8 x i32> @var_ctlz_v8i32(<8 x i32> %a) {
 
 define <8 x i32> @var_ctlz_v8i32u(<8 x i32> %a) {
 ; SSE2-LABEL: 'var_ctlz_v8i32u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v8i32u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v8i32u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v8i32u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v8i32u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v8i32u'
@@ -448,27 +448,27 @@ define <8 x i32> @var_ctlz_v8i32u(<8 x i32> %a) {
 
 define <16 x i32> @var_ctlz_v16i32(<16 x i32> %a) {
 ; SSE2-LABEL: 'var_ctlz_v16i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v16i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v16i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v16i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v16i32'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX512BW-LABEL: 'var_ctlz_v16i32'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v16i32'
@@ -481,27 +481,27 @@ define <16 x i32> @var_ctlz_v16i32(<16 x i32> %a) {
 
 define <16 x i32> @var_ctlz_v16i32u(<16 x i32> %a) {
 ; SSE2-LABEL: 'var_ctlz_v16i32u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v16i32u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v16i32u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v16i32u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v16i32u'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX512BW-LABEL: 'var_ctlz_v16i32u'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v16i32u'
@@ -514,15 +514,15 @@ define <16 x i32> @var_ctlz_v16i32u(<16 x i32> %a) {
 
 define <8 x i16> @var_ctlz_v8i16(<8 x i16> %a) {
 ; SSE2-LABEL: 'var_ctlz_v8i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v8i16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v8i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v8i16'
@@ -534,7 +534,7 @@ define <8 x i16> @var_ctlz_v8i16(<8 x i16> %a) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v8i16'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX-LABEL: 'var_ctlz_v8i16'
@@ -546,15 +546,15 @@ define <8 x i16> @var_ctlz_v8i16(<8 x i16> %a) {
 
 define <8 x i16> @var_ctlz_v8i16u(<8 x i16> %a) {
 ; SSE2-LABEL: 'var_ctlz_v8i16u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v8i16u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v8i16u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v8i16u'
@@ -566,7 +566,7 @@ define <8 x i16> @var_ctlz_v8i16u(<8 x i16> %a) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v8i16u'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
 ;
 ; AVX-LABEL: 'var_ctlz_v8i16u'
@@ -578,27 +578,27 @@ define <8 x i16> @var_ctlz_v8i16u(<8 x i16> %a) {
 
 define <16 x i16> @var_ctlz_v16i16(<16 x i16> %a) {
 ; SSE2-LABEL: 'var_ctlz_v16i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v16i16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v16i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v16i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v16i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v16i16'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
   %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 0)
@@ -607,27 +607,27 @@ define <16 x i16> @var_ctlz_v16i16(<16 x i16> %a) {
 
 define <16 x i16> @var_ctlz_v16i16u(<16 x i16> %a) {
 ; SSE2-LABEL: 'var_ctlz_v16i16u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v16i16u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v16i16u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v16i16u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v16i16u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v16i16u'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 true)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
 ;
   %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 1)
@@ -636,31 +636,31 @@ define <16 x i16> @var_ctlz_v16i16u(<16 x i16> %a) {
 
 define <32 x i16> @var_ctlz_v32i16(<32 x i16> %a) {
 ; SSE2-LABEL: 'var_ctlz_v32i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v32i16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v32i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v32i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v32i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX512BW-LABEL: 'var_ctlz_v32i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v32i16'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 false)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
   %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 0)
@@ -669,31 +669,31 @@ define <32 x i16> @var_ctlz_v32i16(<32 x i16> %a) {
 
 define <32 x i16> @var_ctlz_v32i16u(<32 x i16> %a) {
 ; SSE2-LABEL: 'var_ctlz_v32i16u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v32i16u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v32i16u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v32i16u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v32i16u'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX512BW-LABEL: 'var_ctlz_v32i16u'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v32i16u'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 true)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %ctlz
 ;
   %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 1)
@@ -702,15 +702,15 @@ define <32 x i16> @var_ctlz_v32i16u(<32 x i16> %a) {
 
 define <16 x i8> @var_ctlz_v16i8(<16 x i8> %a) {
 ; SSE2-LABEL: 'var_ctlz_v16i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v16i8'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v16i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v16i8'
@@ -722,7 +722,7 @@ define <16 x i8> @var_ctlz_v16i8(<16 x i8> %a) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v16i8'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX-LABEL: 'var_ctlz_v16i8'
@@ -734,15 +734,15 @@ define <16 x i8> @var_ctlz_v16i8(<16 x i8> %a) {
 
 define <16 x i8> @var_ctlz_v16i8u(<16 x i8> %a) {
 ; SSE2-LABEL: 'var_ctlz_v16i8u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v16i8u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v16i8u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v16i8u'
@@ -754,7 +754,7 @@ define <16 x i8> @var_ctlz_v16i8u(<16 x i8> %a) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v16i8u'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
 ;
 ; AVX-LABEL: 'var_ctlz_v16i8u'
@@ -766,27 +766,27 @@ define <16 x i8> @var_ctlz_v16i8u(<16 x i8> %a) {
 
 define <32 x i8> @var_ctlz_v32i8(<32 x i8> %a) {
 ; SSE2-LABEL: 'var_ctlz_v32i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v32i8'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v32i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v32i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v32i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v32i8'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
   %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 0)
@@ -795,27 +795,27 @@ define <32 x i8> @var_ctlz_v32i8(<32 x i8> %a) {
 
 define <32 x i8> @var_ctlz_v32i8u(<32 x i8> %a) {
 ; SSE2-LABEL: 'var_ctlz_v32i8u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v32i8u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v32i8u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v32i8u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX512-LABEL: 'var_ctlz_v32i8u'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v32i8u'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 true)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
 ;
   %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 1)
@@ -824,31 +824,31 @@ define <32 x i8> @var_ctlz_v32i8u(<32 x i8> %a) {
 
 define <64 x i8> @var_ctlz_v64i8(<64 x i8> %a) {
 ; SSE2-LABEL: 'var_ctlz_v64i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v64i8'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v64i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v64i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v64i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX512BW-LABEL: 'var_ctlz_v64i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v64i8'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 false)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
   %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 0)
@@ -857,31 +857,31 @@ define <64 x i8> @var_ctlz_v64i8(<64 x i8> %a) {
 
 define <64 x i8> @var_ctlz_v64i8u(<64 x i8> %a) {
 ; SSE2-LABEL: 'var_ctlz_v64i8u'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; SSE42-LABEL: 'var_ctlz_v64i8u'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX1-LABEL: 'var_ctlz_v64i8u'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX2-LABEL: 'var_ctlz_v64i8u'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX512F-LABEL: 'var_ctlz_v64i8u'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX512BW-LABEL: 'var_ctlz_v64i8u'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
 ; AVX512CD-LABEL: 'var_ctlz_v64i8u'
-; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
+; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 true)
 ; AVX512CD-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %ctlz
 ;
   %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 1)

diff  --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index 45727a603155c..517abea755e4a 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -259,7 +259,7 @@ define void @cttz(i32 %a, <16 x i32> %va) {
 define void @ctlz(i32 %a, <16 x i32> %va) {
 ; THRU-LABEL: 'ctlz'
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; THRU-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
+; THRU-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'ctlz'

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
index 210ebe84d4f22..bfed230a50c1b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -44,41 +44,20 @@ define void @ctlz_2i64() #0 {
 }
 
 define void @ctlz_4i64() #0 {
-; SSE-LABEL: @ctlz_4i64(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
-; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
-; SSE-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
-; SSE-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
-; SSE-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false)
-; SSE-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false)
-; SSE-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
-; SSE-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
-; SSE-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
-; SSE-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
-; SSE-NEXT:    ret void
-;
-; AVX1-LABEL: @ctlz_4i64(
-; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
-; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
-; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
-; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
-; AVX1-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
-; AVX1-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
-; AVX1-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false)
-; AVX1-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false)
-; AVX1-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
-; AVX1-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
-; AVX1-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
-; AVX1-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
-; AVX1-NEXT:    ret void
-;
-; AVX2-LABEL: @ctlz_4i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> [[TMP1]], i1 false)
-; AVX2-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
-; AVX2-NEXT:    ret void
+; CHECK-LABEL: @ctlz_4i64(
+; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; CHECK-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; CHECK-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; CHECK-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
+; CHECK-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
+; CHECK-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false)
+; CHECK-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false)
+; CHECK-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; CHECK-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; CHECK-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; CHECK-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; CHECK-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
   %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
@@ -96,32 +75,11 @@ define void @ctlz_4i64() #0 {
 }
 
 define void @ctlz_4i32() #0 {
-; SSE2-LABEL: @ctlz_4i32(
-; SSE2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; SSE2-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; SSE2-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; SSE2-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; SSE2-NEXT:    store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    ret void
-;
-; SSE42-LABEL: @ctlz_4i32(
-; SSE42-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; SSE42-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE42-NEXT:    ret void
-;
-; AVX-LABEL: @ctlz_4i32(
-; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; AVX-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @ctlz_4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; CHECK-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
@@ -139,41 +97,14 @@ define void @ctlz_4i32() #0 {
 }
 
 define void @ctlz_8i32() #0 {
-; SSE2-LABEL: @ctlz_8i32(
-; SSE2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; SSE2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; SSE2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; SSE2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; SSE2-NEXT:    [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; SSE2-NEXT:    [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; SSE2-NEXT:    [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; SSE2-NEXT:    [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; SSE2-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; SSE2-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; SSE2-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; SSE2-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; SSE2-NEXT:    [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
-; SSE2-NEXT:    [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
-; SSE2-NEXT:    [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
-; SSE2-NEXT:    [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
-; SSE2-NEXT:    store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; SSE2-NEXT:    store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; SSE2-NEXT:    store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; SSE2-NEXT:    store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; SSE2-NEXT:    store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; SSE2-NEXT:    store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; SSE2-NEXT:    store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; SSE2-NEXT:    store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
-; SSE2-NEXT:    ret void
-;
-; SSE42-LABEL: @ctlz_8i32(
-; SSE42-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false)
-; SSE42-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT:    ret void
+; SSE-LABEL: @ctlz_8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false)
+; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @ctlz_8i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
@@ -506,41 +437,20 @@ define void @ctlz_undef_2i64() #0 {
 }
 
 define void @ctlz_undef_4i64() #0 {
-; SSE-LABEL: @ctlz_undef_4i64(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
-; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
-; SSE-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
-; SSE-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
-; SSE-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true)
-; SSE-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true)
-; SSE-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
-; SSE-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
-; SSE-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
-; SSE-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
-; SSE-NEXT:    ret void
-;
-; AVX1-LABEL: @ctlz_undef_4i64(
-; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
-; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
-; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
-; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
-; AVX1-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
-; AVX1-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
-; AVX1-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true)
-; AVX1-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true)
-; AVX1-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
-; AVX1-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
-; AVX1-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
-; AVX1-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
-; AVX1-NEXT:    ret void
-;
-; AVX2-LABEL: @ctlz_undef_4i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> [[TMP1]], i1 true)
-; AVX2-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
-; AVX2-NEXT:    ret void
+; CHECK-LABEL: @ctlz_undef_4i64(
+; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; CHECK-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; CHECK-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; CHECK-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
+; CHECK-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
+; CHECK-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true)
+; CHECK-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true)
+; CHECK-NEXT:    store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; CHECK-NEXT:    store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; CHECK-NEXT:    store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; CHECK-NEXT:    store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; CHECK-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
   %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
@@ -558,32 +468,11 @@ define void @ctlz_undef_4i64() #0 {
 }
 
 define void @ctlz_undef_4i32() #0 {
-; SSE2-LABEL: @ctlz_undef_4i32(
-; SSE2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; SSE2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; SSE2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; SSE2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; SSE2-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
-; SSE2-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
-; SSE2-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
-; SSE2-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
-; SSE2-NEXT:    store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE2-NEXT:    store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE2-NEXT:    store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE2-NEXT:    store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; SSE2-NEXT:    ret void
-;
-; SSE42-LABEL: @ctlz_undef_4i32(
-; SSE42-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; SSE42-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
-; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE42-NEXT:    ret void
-;
-; AVX-LABEL: @ctlz_undef_4i32(
-; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
-; AVX-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @ctlz_undef_4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; CHECK-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
@@ -601,41 +490,14 @@ define void @ctlz_undef_4i32() #0 {
 }
 
 define void @ctlz_undef_8i32() #0 {
-; SSE2-LABEL: @ctlz_undef_8i32(
-; SSE2-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; SSE2-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; SSE2-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; SSE2-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; SSE2-NEXT:    [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; SSE2-NEXT:    [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; SSE2-NEXT:    [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; SSE2-NEXT:    [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; SSE2-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
-; SSE2-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
-; SSE2-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
-; SSE2-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
-; SSE2-NEXT:    [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 true)
-; SSE2-NEXT:    [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true)
-; SSE2-NEXT:    [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true)
-; SSE2-NEXT:    [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true)
-; SSE2-NEXT:    store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; SSE2-NEXT:    store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; SSE2-NEXT:    store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; SSE2-NEXT:    store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; SSE2-NEXT:    store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; SSE2-NEXT:    store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; SSE2-NEXT:    store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; SSE2-NEXT:    store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
-; SSE2-NEXT:    ret void
-;
-; SSE42-LABEL: @ctlz_undef_8i32(
-; SSE42-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
-; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 true)
-; SSE42-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT:    ret void
+; SSE-LABEL: @ctlz_undef_8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 true)
+; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @ctlz_undef_8i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2


        


More information about the llvm-commits mailing list