[llvm] [CostModel][X86] Update baseline CTTZ/CTLZ costs for x86_64 (PR #124312)

Fri Jan 24 09:26:44 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-llvm-analysis

Author: Simon Pilgrim (RKSimon)

<details>
<summary>Changes</summary>

Followup to #123623 - now that the CMOV has been removed, the throughput has improved, reducing the benefit of vectorization on pre-x86-64-v3 CPUs

---

Patch is 34.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124312.diff


10 Files Affected:

- (modified) llvm/lib/Target/X86/X86TargetTransformInfo.cpp (+8-2) 
- (modified) llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll (+4-4) 
- (modified) llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll (+4-4) 
- (modified) llvm/test/Analysis/CostModel/X86/ctlz.ll (+2-2) 
- (modified) llvm/test/Analysis/CostModel/X86/cttz-codesize.ll (+2-2) 
- (modified) llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll (+4-4) 
- (modified) llvm/test/Analysis/CostModel/X86/cttz.ll (+2-2) 
- (modified) llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll (+3-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll (+107-65) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/cttz.ll (+64-10) 


``````````diff

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d3c923a76d074c..cdc2ce752743cc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4329,9 +4329,15 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::ABS,        MVT::i64,     {  1,  2,  3,  3 } }, // SUB+CMOV
     { ISD::BITREVERSE, MVT::i64,     { 10, 12, 20, 22 } },
     { ISD::BSWAP,      MVT::i64,     {  1,  2,  1,  2 } },
-    { ISD::CTLZ,       MVT::i64,     {  2,  2,  4,  5 } }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTLZ,       MVT::i64,     {  1,  2,  3,  3 } }, // MOV+BSR+XOR
+    { ISD::CTLZ,       MVT::i32,     {  1,  2,  3,  3 } }, // MOV+BSR+XOR
+    { ISD::CTLZ,       MVT::i16,     {  2,  2,  3,  3 } }, // MOV+BSR+XOR
+    { ISD::CTLZ,       MVT::i8,      {  2,  2,  4,  3 } }, // MOV+BSR+XOR
     { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{  1,  2,  2,  2 } }, // BSR+XOR
-    { ISD::CTTZ,       MVT::i64,     {  2,  2,  3,  4 } }, // TEST+BSF+CMOV/BRANCH
+    { ISD::CTTZ,       MVT::i64,     {  1,  2,  2,  2 } }, // MOV+BSF
+    { ISD::CTTZ,       MVT::i32,     {  1,  2,  2,  2 } }, // MOV+BSF
+    { ISD::CTTZ,       MVT::i16,     {  2,  2,  2,  2 } }, // MOV+BSF
+    { ISD::CTTZ,       MVT::i8,      {  2,  2,  2,  2 } }, // MOV+BSF
     { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{  1,  2,  1,  2 } }, // BSF
     { ISD::CTPOP,      MVT::i64,     { 10,  6, 19, 19 } },
     { ISD::ROTL,       MVT::i64,     {  2,  3,  1,  3 } },
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
index da0f71c63ef80e..9f8e4edf7a0fc1 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
@@ -17,7 +17,7 @@ declare  i8 @llvm.ctlz.i8(i8, i1)
 
 define i64 @var_ctlz_i64(i64 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
 
 define i32 @var_ctlz_i32(i32 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
 
 define i16 @var_ctlz_i16(i16 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
 
 define i8 @var_ctlz_i8(i8 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
index 2425e7286265b0..fc3516695852aa 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
@@ -17,7 +17,7 @@ declare  i8 @llvm.ctlz.i8(i8, i1)
 
 define i64 @var_ctlz_i64(i64 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
 
 define i32 @var_ctlz_i32(i32 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
 
 define i16 @var_ctlz_i16(i16 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
 
 define i8 @var_ctlz_i8(i8 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll
index fa7982ce09e9ce..d9d04de12467da 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll
@@ -17,7 +17,7 @@ declare  i8 @llvm.ctlz.i8(i8, i1)
 
 define i64 @var_ctlz_i64(i64 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
 
 define i32 @var_ctlz_i32(i32 %a) {
 ; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
 ; NOLZCNT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
 ;
 ; LZCNT-LABEL: 'var_ctlz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
index 07bf1dd7a2ff6c..621c1b9320fc8d 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
@@ -18,7 +18,7 @@ declare  i8 @llvm.cttz.i8(i8, i1)
 
 define i64 @var_cttz_i64(i64 %a) {
 ; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
 ; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i64'
@@ -40,7 +40,7 @@ define i64 @var_cttz_i64u(i64 %a) {
 
 define i32 @var_cttz_i32(i32 %a) {
 ; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
index afe5cb8c55fe65..34d363ce008795 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
@@ -18,7 +18,7 @@ declare  i8 @llvm.cttz.i8(i8, i1)
 
 define i64 @var_cttz_i64(i64 %a) {
 ; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
 ; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i64'
@@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) {
 
 define i32 @var_cttz_i32(i32 %a) {
 ; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i32'
@@ -70,7 +70,7 @@ define i32 @var_cttz_i32u(i32 %a) {
 
 define i16 @var_cttz_i16(i16 %a) {
 ; NOBMI-LABEL: 'var_cttz_i16'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
+; NOBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
 ; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i16'
@@ -96,7 +96,7 @@ define i16 @var_cttz_i16u(i16 %a) {
 
 define i8 @var_cttz_i8(i8 %a) {
 ; NOBMI-LABEL: 'var_cttz_i8'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
+; NOBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
 ; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll
index fa0f10f886f633..3f5a731b27d9bc 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz.ll
@@ -18,7 +18,7 @@ declare  i8 @llvm.cttz.i8(i8, i1)
 
 define i64 @var_cttz_i64(i64 %a) {
 ; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
 ; NOBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i64'
@@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) {
 
 define i32 @var_cttz_i32(i32 %a) {
 ; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; NOBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz
 ;
 ; BMI-LABEL: 'var_cttz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index 062e5f157bae25..bcef47ee9e0567 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -232,7 +232,7 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
 
 define void @cttz(i32 %a, <16 x i32> %va) {
 ; THRU-LABEL: 'cttz'
-; THRU-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; THRU-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -242,12 +242,12 @@ define void @cttz(i32 %a, <16 x i32> %va) {
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'cttz'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'cttz'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
index 8a22e45fe1ca57..9bf2ade3176d60 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
@@ -136,32 +136,47 @@ define void @ctlz_4i64() #0 {
 }
 
 define void @ctlz_4i32() #0 {
-; SSE2-LABEL: @ctlz_4i32(
-; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
-; SSE2-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE2-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
-; SSE2-NEXT:    ret void
+; SSE-LABEL: @ctlz_4i32(
+; SSE-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 4
+; SSE-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
+; SSE-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 4
+; SSE-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
+; SSE-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
+; SSE-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
+; SSE-NEXT:    ret void
+;
+; AVX1-LABEL: @ctlz_4i32(
+; AVX1-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 4
+; AVX1-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
+; AVX1-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
+; AVX1-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
+; AVX1-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; AVX1-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; AVX1-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; AVX1-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; AVX1-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 4
+; AVX1-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
+; AVX1-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
+; AVX1-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
+; AVX1-NEXT:    ret void
 ;
-; SSE4-LABEL: @ctlz_4i32(
-; SSE4-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 4
-; SSE4-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
-; SSE4-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
-; SSE4-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
-; SSE4-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; SSE4-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; SSE4-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; SSE4-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; SSE4-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 4
-; SSE4-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
-; SSE4-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
-; SSE4-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
-; SSE4-NEXT:    ret void
+; AVX2-LABEL: @ctlz_4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX2-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX2-NEXT:    ret void
 ;
-; AVX-LABEL: @ctlz_4i32(
-; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
-; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; AVX-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
-; AVX-NEXT:    ret void
+; AVX512-LABEL: @ctlz_4i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX512-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX512-NEXT:    ret void
 ;
   %ld0 = load i32, ptr @src32, align 4
   %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
@@ -179,47 +194,71 @@ define void @ctlz_4i32() #0 {
 }
 
 define void @ctlz_8i32() #0 {
-; SSE2-LABEL: @ctlz_8i32(
-; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2
-; SSE2-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE2-NEXT:    stor...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/124312