[llvm] [CostModel][X86] Update baseline CTTZ/CTLZ costs for x86_64 (PR #124312)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 24 09:26:12 PST 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/124312
Follow-up to #123623: now that the CMOV has been removed, scalar throughput has improved, reducing the benefit of vectorization on pre-x86-64-v3 CPUs.
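For reference, the four-element cost tuples in the table below hold per-cost-kind values (throughput, latency, code-size, size-and-latency), which is why the throughput (ctlz.ll/cttz.ll), codesize and sizelatency test files each change independently. The new comments (MOV+BSF, MOV+BSR+XOR) name the scalar sequences the costs now model. The C++ below is only a hedged sketch of the i32 case reconstructed from those comments - the builtin-based helpers, the 63/31 immediates and the instruction mapping in the comments are assumptions for exposition, not codegen output from this patch:

  // Hedged sketch: the scalar sequences the new cost comments describe,
  // written as portable C++. The premise (from the earlier BSF/BSR change)
  // is that on x86-64 the destination register is left unchanged when the
  // source is zero, so the zero-input answer can be preloaded with a plain
  // MOV and no CMOV is needed.
  #include <cstdint>

  // cttz(x) with cttz(0) == 32: models the "MOV+BSF" comment.
  unsigned cttz32(uint32_t X) {
    unsigned Result = 32;            // MOV  eax, 32
    if (X != 0)
      Result = __builtin_ctz(X);     // BSF  eax, edi (eax kept when edi == 0)
    return Result;
  }

  // ctlz(x) with ctlz(0) == 32: models the "MOV+BSR+XOR" comment.
  unsigned ctlz32(uint32_t X) {
    unsigned Idx = 63;               // MOV  eax, 63 (63 ^ 31 == 32)
    if (X != 0)
      Idx = 31 - __builtin_clz(X);   // BSR  eax, edi (bit index of the MSB)
    return Idx ^ 31;                 // XOR  eax, 31
  }

With the CMOV gone, each scalar op costs roughly a reciprocal throughput of 1, which is why the SLP checks below flip from vector back to scalar sequences on SSE/AVX1 targets where the vector expansion is still expensive.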
>From 96a7660b22a4d5cabb022ddbb8b2d2fbd5653b72 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 24 Jan 2025 17:19:31 +0000
Subject: [PATCH] [CostModel][X86] Update baseline CTTZ/CTLZ costs for x86_64
Follow-up to #123623: now that the CMOV has been removed, scalar throughput has improved, reducing the benefit of vectorization on pre-x86-64-v3 CPUs.
---
.../lib/Target/X86/X86TargetTransformInfo.cpp | 10 +-
.../Analysis/CostModel/X86/ctlz-codesize.ll | 8 +-
.../CostModel/X86/ctlz-sizelatency.ll | 8 +-
llvm/test/Analysis/CostModel/X86/ctlz.ll | 4 +-
.../Analysis/CostModel/X86/cttz-codesize.ll | 4 +-
.../CostModel/X86/cttz-sizelatency.ll | 8 +-
llvm/test/Analysis/CostModel/X86/cttz.ll | 4 +-
.../CostModel/X86/intrinsic-cost-kinds.ll | 6 +-
.../test/Transforms/SLPVectorizer/X86/ctlz.ll | 172 +++++++++++-------
.../test/Transforms/SLPVectorizer/X86/cttz.ll | 74 +++++++-
10 files changed, 200 insertions(+), 98 deletions(-)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d3c923a76d074c..cdc2ce752743cc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4329,9 +4329,15 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
{ ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
- { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
+ { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
+ { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
+ { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
{ ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
- { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
+ { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
+ { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
+ { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
{ ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
{ ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
{ ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
index da0f71c63ef80e..9f8e4edf7a0fc1 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
define i16 @var_ctlz_i16(i16 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
define i8 @var_ctlz_i8(i8 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
index 2425e7286265b0..fc3516695852aa 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
define i16 @var_ctlz_i16(i16 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
define i8 @var_ctlz_i8(i8 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll
index fa7982ce09e9ce..d9d04de12467da 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
index 07bf1dd7a2ff6c..621c1b9320fc8d 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
define i64 @var_cttz_i64(i64 %a) {
; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
;
; BMI-LABEL: 'var_cttz_i64'
@@ -40,7 +40,7 @@ define i64 @var_cttz_i64u(i64 %a) {
define i32 @var_cttz_i32(i32 %a) {
; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
;
; BMI-LABEL: 'var_cttz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
index afe5cb8c55fe65..34d363ce008795 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
define i64 @var_cttz_i64(i64 %a) {
; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
;
; BMI-LABEL: 'var_cttz_i64'
@@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) {
define i32 @var_cttz_i32(i32 %a) {
; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
;
; BMI-LABEL: 'var_cttz_i32'
@@ -70,7 +70,7 @@ define i32 @var_cttz_i32u(i32 %a) {
define i16 @var_cttz_i16(i16 %a) {
; NOBMI-LABEL: 'var_cttz_i16'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %cttz
;
; BMI-LABEL: 'var_cttz_i16'
@@ -96,7 +96,7 @@ define i16 @var_cttz_i16u(i16 %a) {
define i8 @var_cttz_i8(i8 %a) {
; NOBMI-LABEL: 'var_cttz_i8'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %cttz
;
; BMI-LABEL: 'var_cttz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll
index fa0f10f886f633..3f5a731b27d9bc 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz.ll
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
define i64 @var_cttz_i64(i64 %a) {
; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz
;
; BMI-LABEL: 'var_cttz_i64'
@@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) {
define i32 @var_cttz_i32(i32 %a) {
; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz
;
; BMI-LABEL: 'var_cttz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index 062e5f157bae25..bcef47ee9e0567 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -232,7 +232,7 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
define void @cttz(i32 %a, <16 x i32> %va) {
; THRU-LABEL: 'cttz'
-; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; THRU-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
@@ -242,12 +242,12 @@ define void @cttz(i32 %a, <16 x i32> %va) {
; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE-LABEL: 'cttz'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE_LATE-LABEL: 'cttz'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
index 8a22e45fe1ca57..9bf2ade3176d60 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
@@ -136,32 +136,47 @@ define void @ctlz_4i64() #0 {
}
define void @ctlz_4i32() #0 {
-; SSE2-LABEL: @ctlz_4i32(
-; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
-; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
-; SSE2-NEXT: ret void
+; SSE-LABEL: @ctlz_4i32(
+; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4
+; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4
+; SSE-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
+; SSE-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @ctlz_4i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4
+; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
+; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
+; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
+; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; AVX1-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4
+; AVX1-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
+; AVX1-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
+; AVX1-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
+; AVX1-NEXT: ret void
;
-; SSE4-LABEL: @ctlz_4i32(
-; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4
-; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
-; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
-; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
-; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4
-; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
-; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
-; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
-; SSE4-NEXT: ret void
+; AVX2-LABEL: @ctlz_4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX2-NEXT: ret void
;
-; AVX-LABEL: @ctlz_4i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
-; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
-; AVX-NEXT: ret void
+; AVX512-LABEL: @ctlz_4i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX512-NEXT: ret void
;
%ld0 = load i32, ptr @src32, align 4
%ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
@@ -179,47 +194,71 @@ define void @ctlz_4i32() #0 {
}
define void @ctlz_8i32() #0 {
-; SSE2-LABEL: @ctlz_8i32(
-; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2
-; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 2
-; SSE2-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
-; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false)
-; SSE2-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
-; SSE2-NEXT: ret void
+; SSE-LABEL: @ctlz_8i32(
+; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2
+; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
+; SSE-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
+; SSE-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
+; SSE-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
+; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
+; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
+; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
+; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
+; SSE-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2
+; SSE-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
+; SSE-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
+; SSE-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
+; SSE-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
+; SSE-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
+; SSE-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
+; SSE-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @ctlz_8i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2
+; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
+; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
+; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
+; AVX1-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
+; AVX1-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
+; AVX1-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
+; AVX1-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
+; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; AVX1-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
+; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
+; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
+; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
+; AVX1-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2
+; AVX1-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
+; AVX1-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
+; AVX1-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
+; AVX1-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
+; AVX1-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
+; AVX1-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
+; AVX1-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
+; AVX1-NEXT: ret void
;
-; SSE4-LABEL: @ctlz_8i32(
-; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2
-; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
-; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
-; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
-; SSE4-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
-; SSE4-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
-; SSE4-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
-; SSE4-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
-; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; SSE4-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
-; SSE4-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
-; SSE4-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
-; SSE4-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
-; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2
-; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
-; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
-; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
-; SSE4-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
-; SSE4-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
-; SSE4-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
-; SSE4-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
-; SSE4-NEXT: ret void
+; AVX2-LABEL: @ctlz_8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
+; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX2-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2
+; AVX2-NEXT: ret void
;
-; AVX-LABEL: @ctlz_8i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
-; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false)
-; AVX-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2
-; AVX-NEXT: ret void
+; AVX512-LABEL: @ctlz_8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
+; AVX512-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX512-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2
+; AVX512-NEXT: ret void
;
%ld0 = load i32, ptr @src32, align 2
%ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
@@ -1063,3 +1102,6 @@ define void @ctlz_undef_32i8() #0 {
}
attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE2: {{.*}}
+; SSE4: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
index 22f0c3f936509b..896be6f2fe2135 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
@@ -142,11 +142,32 @@ define void @cttz_4i32() #0 {
; SSE-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
; SSE-NEXT: ret void
;
-; AVX-LABEL: @cttz_4i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
-; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
-; AVX-NEXT: ret void
+; AVX1-LABEL: @cttz_4i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4
+; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
+; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
+; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
+; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false)
+; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false)
+; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false)
+; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false)
+; AVX1-NEXT: store i32 [[CTTZ0]], ptr @dst32, align 4
+; AVX1-NEXT: store i32 [[CTTZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
+; AVX1-NEXT: store i32 [[CTTZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
+; AVX1-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @cttz_4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @cttz_4i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX512-NEXT: ret void
;
%ld0 = load i32, ptr @src32, align 4
%ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
@@ -191,11 +212,44 @@ define void @cttz_8i32() #0 {
; SSE-NEXT: store i32 [[CTTZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
; SSE-NEXT: ret void
;
-; AVX-LABEL: @cttz_8i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
-; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false)
-; AVX-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2
-; AVX-NEXT: ret void
+; AVX1-LABEL: @cttz_8i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2
+; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
+; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
+; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
+; AVX1-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
+; AVX1-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
+; AVX1-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
+; AVX1-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
+; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false)
+; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false)
+; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false)
+; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false)
+; AVX1-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 false)
+; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false)
+; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false)
+; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false)
+; AVX1-NEXT: store i32 [[CTTZ0]], ptr @dst32, align 2
+; AVX1-NEXT: store i32 [[CTTZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
+; AVX1-NEXT: store i32 [[CTTZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
+; AVX1-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
+; AVX1-NEXT: store i32 [[CTTZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
+; AVX1-NEXT: store i32 [[CTTZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
+; AVX1-NEXT: store i32 [[CTTZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
+; AVX1-NEXT: store i32 [[CTTZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @cttz_8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
+; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX2-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @cttz_8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
+; AVX512-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX512-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2
+; AVX512-NEXT: ret void
;
%ld0 = load i32, ptr @src32, align 2
%ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2