[llvm] [RFC][X86] Allow speculative BSR/BSF instructions on targets with CMOV (PR #102885)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 20 08:17:37 PDT 2024
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/102885
From cc0e78a69336f84812a1d6ea9f3f95a80e8b8abe Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 9 Aug 2024 18:21:51 +0100
Subject: [PATCH] [X86] Allow speculative BSR/BSF instructions on targets with
CMOV
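
For illustration only (not part of the patch): a minimal C++ sketch of the
lowering this change enables. It assumes the documented x86 behaviour that
BSR/BSF set ZF on a zero input and leave their destination undefined, which
is what lets a CMOV select the fallback constant branchlessly. bsr32() is a
hypothetical stand-in for the hardware instruction, not LLVM code.

#include <cstdint>

// Hypothetical model of x86 BSR: index of the highest set bit.
// Real BSR leaves its destination undefined when x == 0.
static uint32_t bsr32(uint32_t x) {
  uint32_t idx = 0;
  while (x >>= 1)
    ++idx;
  return idx;
}

// Old lowering without LZCNT: CodeGenPrepare emits a test and branch
// so BSR is never executed on a zero input.
uint32_t ctlz32_branchy(uint32_t x) {
  if (x == 0)
    return 32;
  return bsr32(x) ^ 31; // v ^ 31 == 31 - v for 5-bit values
}

// New lowering once CMOV is available: speculate BSR, then select the
// fallback branchlessly. BSR sets ZF for a zero input, so CMOVNE keeps
// the BSR result only when the input was nonzero. The fallback is 63
// (not 32) so the shared trailing XOR with 31 still yields 32 for 0.
uint32_t ctlz32_cmov(uint32_t x) {
  uint32_t idx = bsr32(x);            // speculated; ignored when x == 0
  uint32_t sel = (x != 0) ? idx : 63; // becomes cmovnel
  return sel ^ 31;                    // 63 ^ 31 == 32; else 31 - idx
}

This matches the new NOBMI codegen in the bit_ceil.ll and ctlz.ll diffs
below: bsrl; movl $63, ...; cmovnel; xorl $31.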
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +-
.../lib/Target/X86/X86TargetTransformInfo.cpp | 10 +-
.../Analysis/CostModel/X86/ctlz-codesize.ll | 8 +-
.../CostModel/X86/ctlz-sizelatency.ll | 8 +-
llvm/test/Analysis/CostModel/X86/ctlz.ll | 8 +-
.../Analysis/CostModel/X86/cttz-codesize.ll | 2 +-
.../CostModel/X86/cttz-sizelatency.ll | 2 +-
llvm/test/CodeGen/X86/atomic-bit-test.ll | 1 -
llvm/test/CodeGen/X86/bit_ceil.ll | 53 +--
llvm/test/CodeGen/X86/combine-or.ll | 47 ++-
llvm/test/CodeGen/X86/ctlo.ll | 161 ++++++----
llvm/test/CodeGen/X86/ctlz.ll | 304 +++++++++---------
llvm/test/CodeGen/X86/cttz.ll | 37 ++-
llvm/test/CodeGen/X86/known-never-zero.ll | 269 +++++-----------
llvm/test/CodeGen/X86/lzcnt-cmp.ll | 52 +--
llvm/test/CodeGen/X86/pr57673.ll | 50 +--
llvm/test/CodeGen/X86/pr89877.ll | 8 +-
llvm/test/CodeGen/X86/pr92569.ll | 16 +-
.../CodeGenPrepare/X86/cttz-ctlz.ll | 80 ++---
.../test/Transforms/SLPVectorizer/X86/ctlz.ll | 78 ++++-
20 files changed, 516 insertions(+), 682 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 169c955f0ba89f..068904b33ef53c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3239,14 +3239,14 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
// Speculate cttz only if we can directly use TZCNT or can promote to i32/i64.
- return Subtarget.hasBMI() ||
+ return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
(!Ty->isVectorTy() &&
Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
}
bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
// Speculate ctlz only if we can directly use LZCNT.
- return Subtarget.hasLZCNT();
+ return Subtarget.hasLZCNT() || Subtarget.canUseCMOV();
}
bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 9a11c33386fd0b..cb9ee64a677a7e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4210,9 +4210,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
{ ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
- { ISD::CTLZ, MVT::i64, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
- { ISD::CTTZ, MVT::i64, { 2, 2, 5, 5 } }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
{ ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
{ ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
@@ -4241,9 +4241,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
{ ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
{ ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
- { ISD::CTLZ, MVT::i32, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV
- { ISD::CTLZ, MVT::i16, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV
- { ISD::CTLZ, MVT::i8, { 3, 2, 7, 7 } }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
{ ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
{ ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
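[Annotation, not part of the patch: each four-element initializer above is a
per-cost-kind tuple, ordered { throughput, latency, codesize, size+latency }
as used by X86TTIImpl's cost tables. The cost-model test deltas below line up
with the individual elements, e.g. for CTLZ i64:

// { RecipThroughput, Latency, CodeSize, SizeAndLatency }
// CTLZ i64: { 3, 2, 6, 6 } -> { 2, 2, 4, 5 }
//   throughput  3 -> 2  (ctlz.ll)
//   codesize    6 -> 4  (ctlz-codesize.ll)
//   size+lat.   6 -> 5  (ctlz-sizelatency.ll)
]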
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
index ae0f1a3cfad307..da0f71c63ef80e 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
define i16 @var_ctlz_i16(i16 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
define i8 @var_ctlz_i8(i8 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
index 8c6c3228d8fc6e..2425e7286265b0 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
define i16 @var_ctlz_i16(i16 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
define i8 @var_ctlz_i8(i8 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll
index 99e682b8e17826..fa7982ce09e9ce 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
define i16 @var_ctlz_i16(i16 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
define i8 @var_ctlz_i8(i8 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
index 1d40debb7ab816..07bf1dd7a2ff6c 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
define i64 @var_cttz_i64(i64 %a) {
; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
;
; BMI-LABEL: 'var_cttz_i64'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
index 351e863f132067..afe5cb8c55fe65 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
define i64 @var_cttz_i64(i64 %a) {
; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
;
; BMI-LABEL: 'var_cttz_i64'
diff --git a/llvm/test/CodeGen/X86/atomic-bit-test.ll b/llvm/test/CodeGen/X86/atomic-bit-test.ll
index f39c4b5e620d0e..10b6605c3fb05e 100644
--- a/llvm/test/CodeGen/X86/atomic-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-bit-test.ll
@@ -582,7 +582,6 @@ define i32 @split_hoist_and(i32 %0) nounwind {
; X64-NEXT: lock btsl $3, v32(%rip)
; X64-NEXT: setb %al
; X64-NEXT: shll $3, %eax
-; X64-NEXT: testl %edi, %edi
; X64-NEXT: retq
%2 = atomicrmw or ptr @v32, i32 8 monotonic, align 4
%3 = tail call i32 @llvm.ctlz.i32(i32 %0, i1 false)
diff --git a/llvm/test/CodeGen/X86/bit_ceil.ll b/llvm/test/CodeGen/X86/bit_ceil.ll
index 4641c114238f8f..823453087f6180 100644
--- a/llvm/test/CodeGen/X86/bit_ceil.ll
+++ b/llvm/test/CodeGen/X86/bit_ceil.ll
@@ -8,16 +8,12 @@
define i32 @bit_ceil_i32(i32 %x) {
; NOBMI-LABEL: bit_ceil_i32:
; NOBMI: # %bb.0:
-; NOBMI-NEXT: movl %edi, %eax
-; NOBMI-NEXT: decl %eax
-; NOBMI-NEXT: je .LBB0_1
-; NOBMI-NEXT: # %bb.2: # %cond.false
-; NOBMI-NEXT: bsrl %eax, %ecx
+; NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
+; NOBMI-NEXT: leal -1(%rdi), %eax
+; NOBMI-NEXT: bsrl %eax, %eax
+; NOBMI-NEXT: movl $63, %ecx
+; NOBMI-NEXT: cmovnel %eax, %ecx
; NOBMI-NEXT: xorl $31, %ecx
-; NOBMI-NEXT: jmp .LBB0_3
-; NOBMI-NEXT: .LBB0_1:
-; NOBMI-NEXT: movl $32, %ecx
-; NOBMI-NEXT: .LBB0_3: # %cond.end
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
; NOBMI-NEXT: movl $1, %eax
@@ -51,15 +47,10 @@ define i32 @bit_ceil_i32(i32 %x) {
define i32 @bit_ceil_i32_plus1(i32 noundef %x) {
; NOBMI-LABEL: bit_ceil_i32_plus1:
; NOBMI: # %bb.0: # %entry
-; NOBMI-NEXT: testl %edi, %edi
-; NOBMI-NEXT: je .LBB1_1
-; NOBMI-NEXT: # %bb.2: # %cond.false
-; NOBMI-NEXT: bsrl %edi, %ecx
+; NOBMI-NEXT: bsrl %edi, %eax
+; NOBMI-NEXT: movl $63, %ecx
+; NOBMI-NEXT: cmovnel %eax, %ecx
; NOBMI-NEXT: xorl $31, %ecx
-; NOBMI-NEXT: jmp .LBB1_3
-; NOBMI-NEXT: .LBB1_1:
-; NOBMI-NEXT: movl $32, %ecx
-; NOBMI-NEXT: .LBB1_3: # %cond.end
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
; NOBMI-NEXT: movl $1, %eax
@@ -94,16 +85,11 @@ entry:
define i64 @bit_ceil_i64(i64 %x) {
; NOBMI-LABEL: bit_ceil_i64:
; NOBMI: # %bb.0:
-; NOBMI-NEXT: movq %rdi, %rax
-; NOBMI-NEXT: decq %rax
-; NOBMI-NEXT: je .LBB2_1
-; NOBMI-NEXT: # %bb.2: # %cond.false
-; NOBMI-NEXT: bsrq %rax, %rcx
-; NOBMI-NEXT: xorq $63, %rcx
-; NOBMI-NEXT: jmp .LBB2_3
-; NOBMI-NEXT: .LBB2_1:
-; NOBMI-NEXT: movl $64, %ecx
-; NOBMI-NEXT: .LBB2_3: # %cond.end
+; NOBMI-NEXT: leaq -1(%rdi), %rax
+; NOBMI-NEXT: bsrq %rax, %rax
+; NOBMI-NEXT: movl $127, %ecx
+; NOBMI-NEXT: cmovneq %rax, %rcx
+; NOBMI-NEXT: xorl $63, %ecx
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
; NOBMI-NEXT: movl $1, %eax
@@ -136,15 +122,10 @@ define i64 @bit_ceil_i64(i64 %x) {
define i64 @bit_ceil_i64_plus1(i64 noundef %x) {
; NOBMI-LABEL: bit_ceil_i64_plus1:
; NOBMI: # %bb.0: # %entry
-; NOBMI-NEXT: testq %rdi, %rdi
-; NOBMI-NEXT: je .LBB3_1
-; NOBMI-NEXT: # %bb.2: # %cond.false
-; NOBMI-NEXT: bsrq %rdi, %rcx
-; NOBMI-NEXT: xorq $63, %rcx
-; NOBMI-NEXT: jmp .LBB3_3
-; NOBMI-NEXT: .LBB3_1:
-; NOBMI-NEXT: movl $64, %ecx
-; NOBMI-NEXT: .LBB3_3: # %cond.end
+; NOBMI-NEXT: bsrq %rdi, %rax
+; NOBMI-NEXT: movl $127, %ecx
+; NOBMI-NEXT: cmovneq %rax, %rcx
+; NOBMI-NEXT: xorl $63, %ecx
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
; NOBMI-NEXT: movl $1, %eax
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index 3b2102f46a297a..4060355495eb3b 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -213,21 +213,18 @@ define i64 @PR89533(<64 x i8> %a0) {
; SSE-NEXT: shll $16, %ecx
; SSE-NEXT: orl %eax, %ecx
; SSE-NEXT: pcmpeqb %xmm4, %xmm2
-; SSE-NEXT: pmovmskb %xmm2, %edx
-; SSE-NEXT: xorl $65535, %edx # imm = 0xFFFF
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: xorl $65535, %eax # imm = 0xFFFF
; SSE-NEXT: pcmpeqb %xmm4, %xmm3
-; SSE-NEXT: pmovmskb %xmm3, %eax
-; SSE-NEXT: notl %eax
-; SSE-NEXT: shll $16, %eax
-; SSE-NEXT: orl %edx, %eax
-; SSE-NEXT: shlq $32, %rax
-; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: je .LBB11_2
-; SSE-NEXT: # %bb.1: # %cond.false
-; SSE-NEXT: rep bsfq %rax, %rax
-; SSE-NEXT: retq
-; SSE-NEXT: .LBB11_2: # %cond.end
+; SSE-NEXT: pmovmskb %xmm3, %edx
+; SSE-NEXT: notl %edx
+; SSE-NEXT: shll $16, %edx
+; SSE-NEXT: orl %eax, %edx
+; SSE-NEXT: shlq $32, %rdx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: bsfq %rdx, %rcx
; SSE-NEXT: movl $64, %eax
+; SSE-NEXT: cmovneq %rcx, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: PR89533:
@@ -243,23 +240,19 @@ define i64 @PR89533(<64 x i8> %a0) {
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %edx
-; AVX1-NEXT: xorl $65535, %edx # imm = 0xFFFF
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: notl %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: orl %edx, %eax
-; AVX1-NEXT: shlq $32, %rax
-; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: je .LBB11_2
-; AVX1-NEXT: # %bb.1: # %cond.false
-; AVX1-NEXT: rep bsfq %rax, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-; AVX1-NEXT: .LBB11_2: # %cond.end
+; AVX1-NEXT: vpmovmskb %xmm0, %edx
+; AVX1-NEXT: notl %edx
+; AVX1-NEXT: shll $16, %edx
+; AVX1-NEXT: orl %eax, %edx
+; AVX1-NEXT: shlq $32, %rdx
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: bsfq %rdx, %rcx
; AVX1-NEXT: movl $64, %eax
+; AVX1-NEXT: cmovneq %rcx, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index bb80279e28f3d3..f383c9a2544fca 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -13,36 +13,44 @@ declare i32 @llvm.ctlz.i32(i32, i1)
declare i64 @llvm.ctlz.i64(i64, i1)
define i8 @ctlo_i8(i8 %x) {
-; X86-LABEL: ctlo_i8:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorb $-1, %al
-; X86-NEXT: je .LBB0_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: bsrl %eax, %eax
-; X86-NEXT: xorl $7, %eax
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB0_1:
-; X86-NEXT: movb $8, %al
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
+; X86-NOCMOV-LABEL: ctlo_i8:
+; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT: xorb $-1, %al
+; X86-NOCMOV-NEXT: je .LBB0_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT: movzbl %al, %eax
+; X86-NOCMOV-NEXT: bsrl %eax, %eax
+; X86-NOCMOV-NEXT: xorl $7, %eax
+; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB0_1:
+; X86-NOCMOV-NEXT: movb $8, %al
+; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: ctlo_i8:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT: notb %al
+; X86-CMOV-NEXT: movzbl %al, %eax
+; X86-CMOV-NEXT: bsrl %eax, %ecx
+; X86-CMOV-NEXT: movl $15, %eax
+; X86-CMOV-NEXT: cmovnel %ecx, %eax
+; X86-CMOV-NEXT: xorl $7, %eax
+; X86-CMOV-NEXT: # kill: def $al killed $al killed $eax
+; X86-CMOV-NEXT: retl
;
; X64-LABEL: ctlo_i8:
; X64: # %bb.0:
-; X64-NEXT: xorb $-1, %dil
-; X64-NEXT: je .LBB0_1
-; X64-NEXT: # %bb.2: # %cond.false
+; X64-NEXT: notb %dil
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: bsrl %eax, %eax
+; X64-NEXT: bsrl %eax, %ecx
+; X64-NEXT: movl $15, %eax
+; X64-NEXT: cmovnel %ecx, %eax
; X64-NEXT: xorl $7, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
-; X64-NEXT: .LBB0_1:
-; X64-NEXT: movb $8, %al
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlo_i8:
; X86-CLZ: # %bb.0:
@@ -111,34 +119,41 @@ define i8 @ctlo_i8_undef(i8 %x) {
}
define i16 @ctlo_i16(i16 %x) {
-; X86-LABEL: ctlo_i16:
-; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorw $-1, %ax
-; X86-NEXT: je .LBB2_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: bsrw %ax, %ax
-; X86-NEXT: xorl $15, %eax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB2_1:
-; X86-NEXT: movw $16, %ax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: retl
+; X86-NOCMOV-LABEL: ctlo_i16:
+; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT: xorw $-1, %ax
+; X86-NOCMOV-NEXT: je .LBB2_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT: bsrw %ax, %ax
+; X86-NOCMOV-NEXT: xorl $15, %eax
+; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB2_1:
+; X86-NOCMOV-NEXT: movw $16, %ax
+; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NOCMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: ctlo_i16:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT: notl %eax
+; X86-CMOV-NEXT: bsrw %ax, %cx
+; X86-CMOV-NEXT: movw $31, %ax
+; X86-CMOV-NEXT: cmovnew %cx, %ax
+; X86-CMOV-NEXT: xorl $15, %eax
+; X86-CMOV-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-CMOV-NEXT: retl
;
; X64-LABEL: ctlo_i16:
; X64: # %bb.0:
-; X64-NEXT: xorw $-1, %di
-; X64-NEXT: je .LBB2_1
-; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: bsrw %di, %ax
+; X64-NEXT: notl %edi
+; X64-NEXT: bsrw %di, %cx
+; X64-NEXT: movw $31, %ax
+; X64-NEXT: cmovnew %cx, %ax
; X64-NEXT: xorl $15, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
-; X64-NEXT: .LBB2_1:
-; X64-NEXT: movw $16, %ax
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
-; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlo_i16:
; X86-CLZ: # %bb.0:
@@ -193,30 +208,37 @@ define i16 @ctlo_i16_undef(i16 %x) {
}
define i32 @ctlo_i32(i32 %x) {
-; X86-LABEL: ctlo_i32:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl $-1, %eax
-; X86-NEXT: je .LBB4_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: bsrl %eax, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB4_1:
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: retl
+; X86-NOCMOV-LABEL: ctlo_i32:
+; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT: xorl $-1, %eax
+; X86-NOCMOV-NEXT: je .LBB4_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT: bsrl %eax, %eax
+; X86-NOCMOV-NEXT: xorl $31, %eax
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB4_1:
+; X86-NOCMOV-NEXT: movl $32, %eax
+; X86-NOCMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: ctlo_i32:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT: notl %eax
+; X86-CMOV-NEXT: bsrl %eax, %ecx
+; X86-CMOV-NEXT: movl $63, %eax
+; X86-CMOV-NEXT: cmovnel %ecx, %eax
+; X86-CMOV-NEXT: xorl $31, %eax
+; X86-CMOV-NEXT: retl
;
; X64-LABEL: ctlo_i32:
; X64: # %bb.0:
-; X64-NEXT: xorl $-1, %edi
-; X64-NEXT: je .LBB4_1
-; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: notl %edi
+; X64-NEXT: bsrl %edi, %ecx
+; X64-NEXT: movl $63, %eax
+; X64-NEXT: cmovnel %ecx, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
-; X64-NEXT: .LBB4_1:
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlo_i32:
; X86-CLZ: # %bb.0:
@@ -314,15 +336,12 @@ define i64 @ctlo_i64(i64 %x) {
;
; X64-LABEL: ctlo_i64:
; X64: # %bb.0:
-; X64-NEXT: xorq $-1, %rdi
-; X64-NEXT: je .LBB6_1
-; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: bsrq %rdi, %rax
+; X64-NEXT: notq %rdi
+; X64-NEXT: bsrq %rdi, %rcx
+; X64-NEXT: movl $127, %eax
+; X64-NEXT: cmovneq %rcx, %rax
; X64-NEXT: xorq $63, %rax
; X64-NEXT: retq
-; X64-NEXT: .LBB6_1:
-; X64-NEXT: movl $64, %eax
-; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlo_i64:
; X86-CLZ: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index d8f83502bd849a..6635be18b0f7a7 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -218,36 +218,41 @@ define i64 @ctlz_i64(i64 %x) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i8 @ctlz_i8_zero_test(i8 %n) {
-; X86-LABEL: ctlz_i8_zero_test:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testb %al, %al
-; X86-NEXT: je .LBB4_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: bsrl %eax, %eax
-; X86-NEXT: xorl $7, %eax
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB4_1:
-; X86-NEXT: movb $8, %al
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
+; X86-NOCMOV-LABEL: ctlz_i8_zero_test:
+; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT: testb %al, %al
+; X86-NOCMOV-NEXT: je .LBB4_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT: movzbl %al, %eax
+; X86-NOCMOV-NEXT: bsrl %eax, %eax
+; X86-NOCMOV-NEXT: xorl $7, %eax
+; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB4_1:
+; X86-NOCMOV-NEXT: movb $8, %al
+; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: ctlz_i8_zero_test:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT: bsrl %eax, %ecx
+; X86-CMOV-NEXT: movl $15, %eax
+; X86-CMOV-NEXT: cmovnel %ecx, %eax
+; X86-CMOV-NEXT: xorl $7, %eax
+; X86-CMOV-NEXT: # kill: def $al killed $al killed $eax
+; X86-CMOV-NEXT: retl
;
; X64-LABEL: ctlz_i8_zero_test:
; X64: # %bb.0:
-; X64-NEXT: testb %dil, %dil
-; X64-NEXT: je .LBB4_1
-; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: bsrl %eax, %eax
+; X64-NEXT: bsrl %eax, %ecx
+; X64-NEXT: movl $15, %eax
+; X64-NEXT: cmovnel %ecx, %eax
; X64-NEXT: xorl $7, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
-; X64-NEXT: .LBB4_1:
-; X64-NEXT: movb $8, %al
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlz_i8_zero_test:
; X86-CLZ: # %bb.0:
@@ -286,34 +291,38 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i16 @ctlz_i16_zero_test(i16 %n) {
-; X86-LABEL: ctlz_i16_zero_test:
-; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testw %ax, %ax
-; X86-NEXT: je .LBB5_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: bsrw %ax, %ax
-; X86-NEXT: xorl $15, %eax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB5_1:
-; X86-NEXT: movw $16, %ax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: retl
+; X86-NOCMOV-LABEL: ctlz_i16_zero_test:
+; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT: testw %ax, %ax
+; X86-NOCMOV-NEXT: je .LBB5_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT: bsrw %ax, %ax
+; X86-NOCMOV-NEXT: xorl $15, %eax
+; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB5_1:
+; X86-NOCMOV-NEXT: movw $16, %ax
+; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NOCMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: ctlz_i16_zero_test:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: bsrw {{[0-9]+}}(%esp), %cx
+; X86-CMOV-NEXT: movw $31, %ax
+; X86-CMOV-NEXT: cmovnew %cx, %ax
+; X86-CMOV-NEXT: xorl $15, %eax
+; X86-CMOV-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-CMOV-NEXT: retl
;
; X64-LABEL: ctlz_i16_zero_test:
; X64: # %bb.0:
-; X64-NEXT: testw %di, %di
-; X64-NEXT: je .LBB5_1
-; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: bsrw %di, %ax
+; X64-NEXT: bsrw %di, %cx
+; X64-NEXT: movw $31, %ax
+; X64-NEXT: cmovnew %cx, %ax
; X64-NEXT: xorl $15, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
-; X64-NEXT: .LBB5_1:
-; X64-NEXT: movw $16, %ax
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
-; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlz_i16_zero_test:
; X86-CLZ: # %bb.0:
@@ -340,30 +349,34 @@ define i16 @ctlz_i16_zero_test(i16 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i32 @ctlz_i32_zero_test(i32 %n) {
-; X86-LABEL: ctlz_i32_zero_test:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB6_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: bsrl %eax, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB6_1:
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: retl
+; X86-NOCMOV-LABEL: ctlz_i32_zero_test:
+; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT: testl %eax, %eax
+; X86-NOCMOV-NEXT: je .LBB6_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT: bsrl %eax, %eax
+; X86-NOCMOV-NEXT: xorl $31, %eax
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB6_1:
+; X86-NOCMOV-NEXT: movl $32, %eax
+; X86-NOCMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: ctlz_i32_zero_test:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT: movl $63, %eax
+; X86-CMOV-NEXT: cmovnel %ecx, %eax
+; X86-CMOV-NEXT: xorl $31, %eax
+; X86-CMOV-NEXT: retl
;
; X64-LABEL: ctlz_i32_zero_test:
; X64: # %bb.0:
-; X64-NEXT: testl %edi, %edi
-; X64-NEXT: je .LBB6_1
-; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: bsrl %edi, %ecx
+; X64-NEXT: movl $63, %eax
+; X64-NEXT: cmovnel %ecx, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
-; X64-NEXT: .LBB6_1:
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlz_i32_zero_test:
; X86-CLZ: # %bb.0:
@@ -429,15 +442,11 @@ define i64 @ctlz_i64_zero_test(i64 %n) {
;
; X64-LABEL: ctlz_i64_zero_test:
; X64: # %bb.0:
-; X64-NEXT: testq %rdi, %rdi
-; X64-NEXT: je .LBB7_1
-; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: bsrq %rdi, %rax
+; X64-NEXT: bsrq %rdi, %rcx
+; X64-NEXT: movl $127, %eax
+; X64-NEXT: cmovneq %rcx, %rax
; X64-NEXT: xorq $63, %rax
; X64-NEXT: retq
-; X64-NEXT: .LBB7_1:
-; X64-NEXT: movl $64, %eax
-; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlz_i64_zero_test:
; X86-CLZ: # %bb.0:
@@ -580,33 +589,33 @@ define i32 @ctlz_bsr(i32 %n) {
; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
; codegen doesn't know how to combine the $32 and $31 into $63.
define i32 @ctlz_bsr_zero_test(i32 %n) {
-; X86-LABEL: ctlz_bsr_zero_test:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB10_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: bsrl %eax, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB10_1:
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: retl
+; X86-NOCMOV-LABEL: ctlz_bsr_zero_test:
+; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT: testl %eax, %eax
+; X86-NOCMOV-NEXT: je .LBB10_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT: bsrl %eax, %eax
+; X86-NOCMOV-NEXT: xorl $31, %eax
+; X86-NOCMOV-NEXT: xorl $31, %eax
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB10_1:
+; X86-NOCMOV-NEXT: movl $32, %eax
+; X86-NOCMOV-NEXT: xorl $31, %eax
+; X86-NOCMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: ctlz_bsr_zero_test:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT: movl $63, %eax
+; X86-CMOV-NEXT: cmovnel %ecx, %eax
+; X86-CMOV-NEXT: retl
;
; X64-LABEL: ctlz_bsr_zero_test:
; X64: # %bb.0:
-; X64-NEXT: testl %edi, %edi
-; X64-NEXT: je .LBB10_1
-; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: bsrl %edi, %eax
-; X64-NEXT: xorl $31, %eax
-; X64-NEXT: xorl $31, %eax
-; X64-NEXT: retq
-; X64-NEXT: .LBB10_1:
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: xorl $31, %eax
+; X64-NEXT: bsrl %edi, %ecx
+; X64-NEXT: movl $63, %eax
+; X64-NEXT: cmovnel %ecx, %eax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlz_bsr_zero_test:
@@ -945,38 +954,39 @@ define i8 @ctlz_xor7_i8_true(i8 %x) {
}
define i8 @ctlz_xor7_i8_false(i8 %x) {
-; X86-LABEL: ctlz_xor7_i8_false:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testb %al, %al
-; X86-NEXT: je .LBB16_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: bsrl %eax, %eax
-; X86-NEXT: xorl $7, %eax
-; X86-NEXT: xorb $7, %al
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB16_1:
-; X86-NEXT: movb $8, %al
-; X86-NEXT: xorb $7, %al
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
+; X86-NOCMOV-LABEL: ctlz_xor7_i8_false:
+; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT: testb %al, %al
+; X86-NOCMOV-NEXT: je .LBB16_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT: movzbl %al, %eax
+; X86-NOCMOV-NEXT: bsrl %eax, %eax
+; X86-NOCMOV-NEXT: xorl $7, %eax
+; X86-NOCMOV-NEXT: xorb $7, %al
+; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB16_1:
+; X86-NOCMOV-NEXT: movb $8, %al
+; X86-NOCMOV-NEXT: xorb $7, %al
+; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax
+; X86-NOCMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: ctlz_xor7_i8_false:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT: bsrl %eax, %ecx
+; X86-CMOV-NEXT: movl $15, %eax
+; X86-CMOV-NEXT: cmovnel %ecx, %eax
+; X86-CMOV-NEXT: # kill: def $al killed $al killed $eax
+; X86-CMOV-NEXT: retl
;
; X64-LABEL: ctlz_xor7_i8_false:
; X64: # %bb.0:
-; X64-NEXT: testb %dil, %dil
-; X64-NEXT: je .LBB16_1
-; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: bsrl %eax, %eax
-; X64-NEXT: xorl $7, %eax
-; X64-NEXT: xorb $7, %al
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
-; X64-NEXT: .LBB16_1:
-; X64-NEXT: movb $8, %al
-; X64-NEXT: xorb $7, %al
+; X64-NEXT: bsrl %eax, %ecx
+; X64-NEXT: movl $15, %eax
+; X64-NEXT: cmovnel %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
@@ -1060,33 +1070,33 @@ define i16 @ctlz_xor15_i16_true(i16 %x) {
}
define i32 @ctlz_xor31_i32_false(i32 %x) {
-; X86-LABEL: ctlz_xor31_i32_false:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB18_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: bsrl %eax, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB18_1:
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: retl
+; X86-NOCMOV-LABEL: ctlz_xor31_i32_false:
+; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT: testl %eax, %eax
+; X86-NOCMOV-NEXT: je .LBB18_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT: bsrl %eax, %eax
+; X86-NOCMOV-NEXT: xorl $31, %eax
+; X86-NOCMOV-NEXT: xorl $31, %eax
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB18_1:
+; X86-NOCMOV-NEXT: movl $32, %eax
+; X86-NOCMOV-NEXT: xorl $31, %eax
+; X86-NOCMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: ctlz_xor31_i32_false:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT: movl $63, %eax
+; X86-CMOV-NEXT: cmovnel %ecx, %eax
+; X86-CMOV-NEXT: retl
;
; X64-LABEL: ctlz_xor31_i32_false:
; X64: # %bb.0:
-; X64-NEXT: testl %edi, %edi
-; X64-NEXT: je .LBB18_1
-; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: bsrl %edi, %eax
-; X64-NEXT: xorl $31, %eax
-; X64-NEXT: xorl $31, %eax
-; X64-NEXT: retq
-; X64-NEXT: .LBB18_1:
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: xorl $31, %eax
+; X64-NEXT: bsrl %edi, %ecx
+; X64-NEXT: movl $63, %eax
+; X64-NEXT: cmovnel %ecx, %eax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlz_xor31_i32_false:
diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll
index b35a1b72fcb6f1..27f229b18bf057 100644
--- a/llvm/test/CodeGen/X86/cttz.ll
+++ b/llvm/test/CodeGen/X86/cttz.ll
@@ -303,17 +303,24 @@ define i16 @cttz_i16_zero_test(i16 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i32 @cttz_i32_zero_test(i32 %n) {
-; X86-LABEL: cttz_i32_zero_test:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB6_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB6_1:
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: retl
+; X86-NOCMOV-LABEL: cttz_i32_zero_test:
+; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT: testl %eax, %eax
+; X86-NOCMOV-NEXT: je .LBB6_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT: rep bsfl %eax, %eax
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB6_1:
+; X86-NOCMOV-NEXT: movl $32, %eax
+; X86-NOCMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: cttz_i32_zero_test:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT: movl $32, %eax
+; X86-CMOV-NEXT: cmovnel %ecx, %eax
+; X86-CMOV-NEXT: retl
;
; X64-LABEL: cttz_i32_zero_test:
; X64: # %bb.0:
@@ -386,13 +393,9 @@ define i64 @cttz_i64_zero_test(i64 %n) {
;
; X64-LABEL: cttz_i64_zero_test:
; X64: # %bb.0:
-; X64-NEXT: testq %rdi, %rdi
-; X64-NEXT: je .LBB7_1
-; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfq %rdi, %rax
-; X64-NEXT: retq
-; X64-NEXT: .LBB7_1:
+; X64-NEXT: bsfq %rdi, %rcx
; X64-NEXT: movl $64, %eax
+; X64-NEXT: cmovneq %rcx, %rax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: cttz_i64_zero_test:
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index d5d604a138a719..ac41a3fe6bb7e4 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -44,12 +44,9 @@ define i32 @or_maybe_zero(i32 %x, i32 %y) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: je .LBB1_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB1_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: or_maybe_zero:
@@ -94,18 +91,14 @@ define i32 @select_known_nonzero(i1 %c, i32 %x) {
define i32 @select_maybe_zero(i1 %c, i32 %x) {
; X86-LABEL: select_maybe_zero:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: orl $1, %ecx
-; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl $1, %eax
+; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnel %ecx, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB3_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB3_1:
+; X86-NEXT: cmovnel %eax, %ecx
+; X86-NEXT: bsfl %ecx, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: select_maybe_zero:
@@ -201,13 +194,9 @@ define i32 @shl_maybe_zero(i32 %x, i32 %y) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll %cl, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB7_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB7_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: shl_maybe_zero:
@@ -251,17 +240,13 @@ define i32 @uaddsat_known_nonzero(i32 %x) {
define i32 @uaddsat_maybe_zero(i32 %x, i32 %y) {
; X86-LABEL: uaddsat_maybe_zero:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1, %eax
-; X86-NEXT: cmovael %ecx, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB9_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB9_1:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovael %eax, %ecx
+; X86-NEXT: bsfl %ecx, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: uaddsat_maybe_zero:
@@ -314,13 +299,9 @@ define i32 @umax_maybe_zero(i32 %x, i32 %y) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmpl %eax, %ecx
; X86-NEXT: cmoval %ecx, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB11_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB11_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: umax_maybe_zero:
@@ -372,17 +353,13 @@ define i32 @umin_known_nonzero(i32 %xx, i32 %yy) {
define i32 @umin_maybe_zero(i32 %x, i32 %y) {
; X86-LABEL: umin_maybe_zero:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmpl $54, %ecx
-; X86-NEXT: movl $54, %eax
-; X86-NEXT: cmovbl %ecx, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB13_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB13_1:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $54, %eax
+; X86-NEXT: movl $54, %ecx
+; X86-NEXT: cmovbl %eax, %ecx
+; X86-NEXT: bsfl %ecx, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: umin_maybe_zero:
@@ -490,17 +467,13 @@ define <4 x i32> @smin_known_zero_vec(<4 x i32> %x, <4 x i32> %y) {
define i32 @smin_maybe_zero(i32 %x, i32 %y) {
; X86-LABEL: smin_maybe_zero:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmpl $54, %ecx
-; X86-NEXT: movl $54, %eax
-; X86-NEXT: cmovll %ecx, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB17_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB17_1:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl $54, %eax
+; X86-NEXT: movl $54, %ecx
+; X86-NEXT: cmovll %eax, %ecx
+; X86-NEXT: bsfl %ecx, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: smin_maybe_zero:
@@ -608,17 +581,13 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) {
define i32 @smax_known_zero(i32 %x, i32 %y) {
; X86-LABEL: smax_known_zero:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: testl %ecx, %ecx
-; X86-NEXT: movl $-1, %eax
-; X86-NEXT: cmovnsl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB21_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB21_1:
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovnsl %eax, %ecx
+; X86-NEXT: bsfl %ecx, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: smax_known_zero:
@@ -643,14 +612,8 @@ define i32 @rotr_known_nonzero(i32 %xx, i32 %y) {
; X86-NEXT: movl $256, %eax # imm = 0x100
; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
; X86-NEXT: rorl %cl, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB22_1
-; X86-NEXT: # %bb.2: # %cond.false
; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
-; X86-NEXT: .LBB22_1:
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: retl
;
; X64-LABEL: rotr_known_nonzero:
; X64: # %bb.0:
@@ -675,13 +638,9 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: rorl %cl, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB23_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB23_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: rotr_maybe_zero:
@@ -733,13 +692,9 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: rorl %cl, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB25_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB25_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: rotr_with_fshr_maybe_zero:
@@ -765,14 +720,8 @@ define i32 @rotl_known_nonzero(i32 %xx, i32 %y) {
; X86-NEXT: movl $256, %eax # imm = 0x100
; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
; X86-NEXT: roll %cl, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB26_1
-; X86-NEXT: # %bb.2: # %cond.false
; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
-; X86-NEXT: .LBB26_1:
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: retl
;
; X64-LABEL: rotl_known_nonzero:
; X64: # %bb.0:
@@ -797,13 +746,9 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: roll %cl, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB27_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB27_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: rotl_maybe_zero:
@@ -855,13 +800,9 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: roll %cl, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB29_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB29_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: rotl_with_fshl_maybe_zero:
@@ -932,13 +873,9 @@ define i32 @sra_maybe_zero(i32 %x, i32 %y) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl %cl, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB32_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB32_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: sra_maybe_zero:
@@ -1009,13 +946,9 @@ define i32 @srl_maybe_zero(i32 %x, i32 %y) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB35_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB35_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: srl_maybe_zero:
@@ -1064,13 +997,9 @@ define i32 @udiv_maybe_zero(i32 %x, i32 %y) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl {{[0-9]+}}(%esp)
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB37_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB37_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: udiv_maybe_zero:
@@ -1119,13 +1048,9 @@ define i32 @sdiv_maybe_zero(i32 %x, i32 %y) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cltd
; X86-NEXT: idivl {{[0-9]+}}(%esp)
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB39_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB39_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: sdiv_maybe_zero:
@@ -1171,12 +1096,9 @@ define i32 @add_maybe_zero(i32 %xx, i32 %y) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: orl $1, %eax
; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: je .LBB41_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB41_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: add_maybe_zero:
@@ -1249,16 +1171,13 @@ define i32 @sub_known_nonzero_ne_case(i32 %xx, i32 %yy) {
define i32 @sub_maybe_zero(i32 %x) {
; X86-LABEL: sub_maybe_zero:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: orl $64, %eax
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: je .LBB44_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB44_1:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: bsfl %ecx, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: sub_maybe_zero:
@@ -1280,14 +1199,11 @@ define i32 @sub_maybe_zero(i32 %x) {
define i32 @sub_maybe_zero2(i32 %x) {
; X86-LABEL: sub_maybe_zero2:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: negl %eax
-; X86-NEXT: je .LBB45_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB45_1:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: sub_maybe_zero2:
@@ -1310,13 +1226,9 @@ define i32 @mul_known_nonzero_nsw(i32 %x, i32 %yy) {
; X86-NEXT: movl $256, %eax # imm = 0x100
; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB46_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB46_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: mul_known_nonzero_nsw:
@@ -1341,13 +1253,9 @@ define i32 @mul_known_nonzero_nuw(i32 %x, i32 %yy) {
; X86-NEXT: movl $256, %eax # imm = 0x100
; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB47_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB47_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: mul_known_nonzero_nuw:
@@ -1371,13 +1279,9 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB48_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB48_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: mul_maybe_zero:
@@ -1433,13 +1337,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) {
; X86-LABEL: bitcast_maybe_zero:
; X86: # %bb.0:
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB50_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB50_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: bitcast_maybe_zero:
@@ -1458,15 +1358,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) {
define i32 @bitcast_from_float(float %x) {
; X86-LABEL: bitcast_from_float:
; X86: # %bb.0:
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB51_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB51_1:
+; X86-NEXT: bsfl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: bitcast_from_float:
@@ -1511,14 +1405,9 @@ define i32 @zext_maybe_zero(i16 %x) {
; X86-LABEL: zext_maybe_zero:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testw %ax, %ax
-; X86-NEXT: je .LBB53_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB53_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: zext_maybe_zero:
@@ -1563,13 +1452,9 @@ define i32 @sext_maybe_zero(i16 %x) {
; X86-LABEL: sext_maybe_zero:
; X86: # %bb.0:
; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB55_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB55_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: sext_maybe_zero:
diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
index a9513a373661f4..4f65739cc70dd1 100644
--- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll
+++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
@@ -12,27 +12,11 @@ define i1 @lshr_ctlz_cmpeq_one_i64(i64 %in) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: retl
;
-; X64-BSR-LABEL: lshr_ctlz_cmpeq_one_i64:
-; X64-BSR: # %bb.0:
-; X64-BSR-NEXT: testq %rdi, %rdi
-; X64-BSR-NEXT: je .LBB0_1
-; X64-BSR-NEXT: # %bb.2: # %cond.false
-; X64-BSR-NEXT: bsrq %rdi, %rax
-; X64-BSR-NEXT: xorq $63, %rax
-; X64-BSR-NEXT: shrl $6, %eax
-; X64-BSR-NEXT: # kill: def $al killed $al killed $rax
-; X64-BSR-NEXT: retq
-; X64-BSR-NEXT: .LBB0_1:
-; X64-BSR-NEXT: movl $64, %eax
-; X64-BSR-NEXT: shrl $6, %eax
-; X64-BSR-NEXT: # kill: def $al killed $al killed $rax
-; X64-BSR-NEXT: retq
-;
-; X64-LZCNT-LABEL: lshr_ctlz_cmpeq_one_i64:
-; X64-LZCNT: # %bb.0:
-; X64-LZCNT-NEXT: testq %rdi, %rdi
-; X64-LZCNT-NEXT: sete %al
-; X64-LZCNT-NEXT: retq
+; X64-LABEL: lshr_ctlz_cmpeq_one_i64:
+; X64: # %bb.0:
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: sete %al
+; X64-NEXT: retq
%ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0)
%lshr = lshr i64 %ctlz, 6
%icmp = icmp eq i64 %lshr, 1
@@ -81,27 +65,11 @@ define i1 @lshr_ctlz_cmpne_zero_i64(i64 %in) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: retl
;
-; X64-BSR-LABEL: lshr_ctlz_cmpne_zero_i64:
-; X64-BSR: # %bb.0:
-; X64-BSR-NEXT: testq %rdi, %rdi
-; X64-BSR-NEXT: je .LBB2_1
-; X64-BSR-NEXT: # %bb.2: # %cond.false
-; X64-BSR-NEXT: bsrq %rdi, %rax
-; X64-BSR-NEXT: xorq $63, %rax
-; X64-BSR-NEXT: shrl $6, %eax
-; X64-BSR-NEXT: # kill: def $al killed $al killed $rax
-; X64-BSR-NEXT: retq
-; X64-BSR-NEXT: .LBB2_1:
-; X64-BSR-NEXT: movl $64, %eax
-; X64-BSR-NEXT: shrl $6, %eax
-; X64-BSR-NEXT: # kill: def $al killed $al killed $rax
-; X64-BSR-NEXT: retq
-;
-; X64-LZCNT-LABEL: lshr_ctlz_cmpne_zero_i64:
-; X64-LZCNT: # %bb.0:
-; X64-LZCNT-NEXT: testq %rdi, %rdi
-; X64-LZCNT-NEXT: sete %al
-; X64-LZCNT-NEXT: retq
+; X64-LABEL: lshr_ctlz_cmpne_zero_i64:
+; X64: # %bb.0:
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: sete %al
+; X64-NEXT: retq
%ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0)
%lshr = lshr i64 %ctlz, 6
%icmp = icmp ne i64 %lshr, 0
diff --git a/llvm/test/CodeGen/X86/pr57673.ll b/llvm/test/CodeGen/X86/pr57673.ll
index d0ae6cea068dc0..cf7717f420480b 100644
--- a/llvm/test/CodeGen/X86/pr57673.ll
+++ b/llvm/test/CodeGen/X86/pr57673.ll
@@ -24,35 +24,24 @@ define void @foo() {
; NORMAL-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit
; NORMAL-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.1.i, 1, $noreg, 0, $noreg
; NORMAL-NEXT: [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF
- ; NORMAL-NEXT: [[DEF1:%[0-9]+]]:gr64 = IMPLICIT_DEF
; NORMAL-NEXT: {{ $}}
; NORMAL-NEXT: bb.1.bb_8:
- ; NORMAL-NEXT: successors: %bb.5(0x40000000), %bb.2(0x40000000)
+ ; NORMAL-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; NORMAL-NEXT: {{ $}}
; NORMAL-NEXT: TEST8rr [[COPY]], [[COPY]], implicit-def $eflags
- ; NORMAL-NEXT: JCC_1 %bb.5, 5, implicit $eflags
+ ; NORMAL-NEXT: JCC_1 %bb.3, 5, implicit $eflags
; NORMAL-NEXT: JMP_1 %bb.2
; NORMAL-NEXT: {{ $}}
; NORMAL-NEXT: bb.2.bb_mid:
- ; NORMAL-NEXT: successors: %bb.4(0x30000000), %bb.3(0x50000000)
+ ; NORMAL-NEXT: successors: %bb.3(0x80000000)
; NORMAL-NEXT: {{ $}}
- ; NORMAL-NEXT: TEST64rr [[DEF1]], [[DEF1]], implicit-def $eflags
- ; NORMAL-NEXT: JCC_1 %bb.4, 4, implicit $eflags
- ; NORMAL-NEXT: JMP_1 %bb.3
- ; NORMAL-NEXT: {{ $}}
- ; NORMAL-NEXT: bb.3.cond.false:
- ; NORMAL-NEXT: successors: %bb.4(0x80000000)
- ; NORMAL-NEXT: {{ $}}
- ; NORMAL-NEXT: bb.4.cond.end:
- ; NORMAL-NEXT: successors: %bb.5(0x80000000)
- ; NORMAL-NEXT: {{ $}}
- ; NORMAL-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8)
+ ; NORMAL-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8)
; NORMAL-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8)
- ; NORMAL-NEXT: DBG_VALUE_LIST !3, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 40), [[LEA64r]], [[LEA64r]], debug-location !8
- ; NORMAL-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8)
+ ; NORMAL-NEXT: DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8
+ ; NORMAL-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8)
; NORMAL-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8)
; NORMAL-NEXT: {{ $}}
- ; NORMAL-NEXT: bb.5.bb_last:
+ ; NORMAL-NEXT: bb.3.bb_last:
; NORMAL-NEXT: successors: %bb.1(0x80000000)
; NORMAL-NEXT: {{ $}}
; NORMAL-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
@@ -74,35 +63,24 @@ define void @foo() {
; INSTRREF-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit
; INSTRREF-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.1.i, 1, $noreg, 0, $noreg
; INSTRREF-NEXT: [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF
- ; INSTRREF-NEXT: [[DEF1:%[0-9]+]]:gr64 = IMPLICIT_DEF
; INSTRREF-NEXT: {{ $}}
; INSTRREF-NEXT: bb.1.bb_8:
- ; INSTRREF-NEXT: successors: %bb.5(0x40000000), %bb.2(0x40000000)
+ ; INSTRREF-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; INSTRREF-NEXT: {{ $}}
; INSTRREF-NEXT: TEST8rr [[COPY]], [[COPY]], implicit-def $eflags
- ; INSTRREF-NEXT: JCC_1 %bb.5, 5, implicit $eflags
+ ; INSTRREF-NEXT: JCC_1 %bb.3, 5, implicit $eflags
; INSTRREF-NEXT: JMP_1 %bb.2
; INSTRREF-NEXT: {{ $}}
; INSTRREF-NEXT: bb.2.bb_mid:
- ; INSTRREF-NEXT: successors: %bb.4(0x30000000), %bb.3(0x50000000)
- ; INSTRREF-NEXT: {{ $}}
- ; INSTRREF-NEXT: TEST64rr [[DEF1]], [[DEF1]], implicit-def $eflags
- ; INSTRREF-NEXT: JCC_1 %bb.4, 4, implicit $eflags
- ; INSTRREF-NEXT: JMP_1 %bb.3
- ; INSTRREF-NEXT: {{ $}}
- ; INSTRREF-NEXT: bb.3.cond.false:
- ; INSTRREF-NEXT: successors: %bb.4(0x80000000)
- ; INSTRREF-NEXT: {{ $}}
- ; INSTRREF-NEXT: bb.4.cond.end:
- ; INSTRREF-NEXT: successors: %bb.5(0x80000000)
+ ; INSTRREF-NEXT: successors: %bb.3(0x80000000)
; INSTRREF-NEXT: {{ $}}
- ; INSTRREF-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8)
+ ; INSTRREF-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8)
; INSTRREF-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8)
- ; INSTRREF-NEXT: DBG_INSTR_REF !3, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), dbg-instr-ref(1, 0), debug-location !8
- ; INSTRREF-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8)
+ ; INSTRREF-NEXT: DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8
+ ; INSTRREF-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8)
; INSTRREF-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8)
; INSTRREF-NEXT: {{ $}}
- ; INSTRREF-NEXT: bb.5.bb_last:
+ ; INSTRREF-NEXT: bb.3.bb_last:
; INSTRREF-NEXT: successors: %bb.1(0x80000000)
; INSTRREF-NEXT: {{ $}}
; INSTRREF-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
diff --git a/llvm/test/CodeGen/X86/pr89877.ll b/llvm/test/CodeGen/X86/pr89877.ll
index fdbe75b467d992..19baad26583ada 100644
--- a/llvm/test/CodeGen/X86/pr89877.ll
+++ b/llvm/test/CodeGen/X86/pr89877.ll
@@ -9,13 +9,9 @@ define i32 @sext_known_nonzero(i16 %xx) {
; X86-NEXT: movl $256, %eax # imm = 0x100
; X86-NEXT: shll %cl, %eax
; X86-NEXT: cwtl
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB0_1
-; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB0_1:
+; X86-NEXT: bsfl %eax, %ecx
; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: sext_known_nonzero:
diff --git a/llvm/test/CodeGen/X86/pr92569.ll b/llvm/test/CodeGen/X86/pr92569.ll
index f91063089e3a90..0fb4ed7905287c 100644
--- a/llvm/test/CodeGen/X86/pr92569.ll
+++ b/llvm/test/CodeGen/X86/pr92569.ll
@@ -4,17 +4,13 @@
define void @PR92569(i64 %arg, <8 x i8> %arg1) {
; CHECK-LABEL: PR92569:
; CHECK: # %bb.0:
-; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: je .LBB0_1
-; CHECK-NEXT: # %bb.2: # %cond.false
-; CHECK-NEXT: rep bsfq %rdi, %rax
-; CHECK-NEXT: jmp .LBB0_3
-; CHECK-NEXT: .LBB0_1:
-; CHECK-NEXT: movl $64, %eax
-; CHECK-NEXT: .LBB0_3: # %cond.end
-; CHECK-NEXT: shrb $3, %al
+; CHECK-NEXT: bsfq %rdi, %rax
+; CHECK-NEXT: movl $64, %ecx
+; CHECK-NEXT: cmovneq %rax, %rcx
+; CHECK-NEXT: shrb $3, %cl
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: andl $15, %eax
; CHECK-NEXT: movzbl -24(%rsp,%rax), %eax
; CHECK-NEXT: movl %eax, 0
; CHECK-NEXT: retq
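
The PR92569 output above gives the clearest view of the new lowering: the test-and-branch guarding BSF is gone, replaced by an unconditional bit scan plus a conditional move. BSF and BSR set ZF when the source is zero (the destination is architecturally undefined in that case), so a CMOVNE can select between the scan result and the type-width constant. A sketch of the i64 pattern, assuming a CMOV-capable x86-64 target (register assignments taken from the checks above):

  define i64 @cttz64(i64 %x) {
    ; expected lowering, per the updated checks:
    ;   bsfq    %rdi, %rax      ; sets ZF, leaves %rax undefined if %rdi == 0
    ;   movl    $64, %ecx       ; fallback value for a zero input
    ;   cmovneq %rax, %rcx      ; keep the BSF result when ZF is clear
    %z = call i64 @llvm.cttz.i64(i64 %x, i1 false)
    ret i64 %z
  }

The trade is the one the patch title describes: the bit scan now executes unconditionally, but a data dependency on CMOV replaces a branch.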
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
index 06909d950addb6..2c2923440bf7c2 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s --check-prefix=SLOW
-; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mattr=+bmi < %s | FileCheck %s --check-prefix=FAST_TZ
-; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mattr=+lzcnt < %s | FileCheck %s --check-prefix=FAST_LZ
+; RUN: opt -S -passes="require<profile-summary>,function(codegenprepare)" < %s | FileCheck %s --check-prefix=SLOW
+; RUN: opt -S -passes="require<profile-summary>,function(codegenprepare)" -mattr=+bmi < %s | FileCheck %s --check-prefix=FAST_TZ
+; RUN: opt -S -passes="require<profile-summary>,function(codegenprepare)" -mattr=+lzcnt < %s | FileCheck %s --check-prefix=FAST_LZ
-; RUN: opt -S -enable-debugify -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s --check-prefix=DEBUGINFO
-; RUN: opt -S -enable-debugify -passes='require<profile-summary>,function(codegenprepare)' --try-experimental-debuginfo-iterators < %s | FileCheck %s --check-prefix=DEBUGINFO
+; RUN: opt -S -enable-debugify -passes="require<profile-summary>,function(codegenprepare)" < %s | FileCheck %s --check-prefix=DEBUGINFO
+; RUN: opt -S -enable-debugify -passes="require<profile-summary>,function(codegenprepare)" --try-experimental-debuginfo-iterators < %s | FileCheck %s --check-prefix=DEBUGINFO
target triple = "x86_64-unknown-unknown"
target datalayout = "e-n32:64"
@@ -16,15 +16,8 @@ target datalayout = "e-n32:64"
define i64 @cttz(i64 %A) {
; SLOW-LABEL: @cttz(
; SLOW-NEXT: entry:
-; SLOW-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]]
-; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0
-; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
-; SLOW: cond.false:
-; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true)
-; SLOW-NEXT: br label [[COND_END]]
-; SLOW: cond.end:
-; SLOW-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
-; SLOW-NEXT: ret i64 [[CTZ]]
+; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false)
+; SLOW-NEXT: ret i64 [[Z]]
;
; FAST_TZ-LABEL: @cttz(
; FAST_TZ-NEXT: entry:
@@ -33,28 +26,14 @@ define i64 @cttz(i64 %A) {
;
; FAST_LZ-LABEL: @cttz(
; FAST_LZ-NEXT: entry:
-; FAST_LZ-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]]
-; FAST_LZ-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0
-; FAST_LZ-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
-; FAST_LZ: cond.false:
-; FAST_LZ-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true)
-; FAST_LZ-NEXT: br label [[COND_END]]
-; FAST_LZ: cond.end:
-; FAST_LZ-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
-; FAST_LZ-NEXT: ret i64 [[CTZ]]
+; FAST_LZ-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false)
+; FAST_LZ-NEXT: ret i64 [[Z]]
;
; DEBUGINFO-LABEL: @cttz(
; DEBUGINFO-NEXT: entry:
-; DEBUGINFO-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]], !dbg [[DBG11:![0-9]+]]
-; DEBUGINFO-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0, !dbg [[DBG11]]
-; DEBUGINFO-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG11]]
-; DEBUGINFO: cond.false:
-; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true), !dbg [[DBG11]]
-; DEBUGINFO-NEXT: br label [[COND_END]], !dbg [[DBG12:![0-9]+]]
-; DEBUGINFO: cond.end:
-; DEBUGINFO-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ], !dbg [[DBG12]]
-; DEBUGINFO-NEXT: #dbg_value(i64 [[CTZ]], [[META9:![0-9]+]], !DIExpression(), [[DBG11]])
-; DEBUGINFO-NEXT: ret i64 [[CTZ]], !dbg [[DBG12]]
+; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false), !dbg [[DBG11:![0-9]+]]
+; DEBUGINFO-NEXT: #dbg_value(i64 [[Z]], [[META9:![0-9]+]], !DIExpression(), [[DBG11]])
+; DEBUGINFO-NEXT: ret i64 [[Z]], !dbg [[DBG12:![0-9]+]]
;
entry:
%z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
@@ -64,27 +43,13 @@ entry:
define i64 @ctlz(i64 %A) {
; SLOW-LABEL: @ctlz(
; SLOW-NEXT: entry:
-; SLOW-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]]
-; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0
-; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
-; SLOW: cond.false:
-; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true)
-; SLOW-NEXT: br label [[COND_END]]
-; SLOW: cond.end:
-; SLOW-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
-; SLOW-NEXT: ret i64 [[CTZ]]
+; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false)
+; SLOW-NEXT: ret i64 [[Z]]
;
; FAST_TZ-LABEL: @ctlz(
; FAST_TZ-NEXT: entry:
-; FAST_TZ-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]]
-; FAST_TZ-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0
-; FAST_TZ-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
-; FAST_TZ: cond.false:
-; FAST_TZ-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true)
-; FAST_TZ-NEXT: br label [[COND_END]]
-; FAST_TZ: cond.end:
-; FAST_TZ-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
-; FAST_TZ-NEXT: ret i64 [[CTZ]]
+; FAST_TZ-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false)
+; FAST_TZ-NEXT: ret i64 [[Z]]
;
; FAST_LZ-LABEL: @ctlz(
; FAST_LZ-NEXT: entry:
@@ -93,16 +58,9 @@ define i64 @ctlz(i64 %A) {
;
; DEBUGINFO-LABEL: @ctlz(
; DEBUGINFO-NEXT: entry:
-; DEBUGINFO-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]], !dbg [[DBG16:![0-9]+]]
-; DEBUGINFO-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0, !dbg [[DBG16]]
-; DEBUGINFO-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG16]]
-; DEBUGINFO: cond.false:
-; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true), !dbg [[DBG16]]
-; DEBUGINFO-NEXT: br label [[COND_END]], !dbg [[DBG17:![0-9]+]]
-; DEBUGINFO: cond.end:
-; DEBUGINFO-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ], !dbg [[DBG17]]
-; DEBUGINFO-NEXT: #dbg_value(i64 [[CTZ]], [[META15:![0-9]+]], !DIExpression(), [[DBG16]])
-; DEBUGINFO-NEXT: ret i64 [[CTZ]], !dbg [[DBG17]]
+; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false), !dbg [[DBG16:![0-9]+]]
+; DEBUGINFO-NEXT: #dbg_value(i64 [[Z]], [[META15:![0-9]+]], !DIExpression(), [[DBG16]])
+; DEBUGINFO-NEXT: ret i64 [[Z]], !dbg [[DBG17:![0-9]+]]
;
entry:
%z = call i64 @llvm.ctlz.i64(i64 %A, i1 false)
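
These CodeGenPrepare diffs are the IR-level half of the change: the pass used to despeculate cttz/ctlz calls that the target reported as expensive, rewriting the plain intrinsic call into a freeze/icmp/branch diamond with a phi of the type width. With isCheapToSpeculateCttz/Ctlz now returning true whenever CMOV is available, the SLOW run leaves the call untouched, matching FAST_TZ/FAST_LZ:

  define i64 @cttz(i64 %A) {
  entry:
    ; previously expanded here into freeze + icmp eq 0 + branch +
    ; cttz(..., i1 true) + phi [64, entry]; now left intact, and the
    ; zero case is resolved later during instruction selection:
    %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
    ret i64 %z
  }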
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
index 0462f125955bf4..8a22e45fe1ca57 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
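
The SSE prefix split is needed because the two SSE runs now diverge: cheaper scalar CTLZ costs (BSR+XOR+CMOV instead of a branchy expansion) shift the scalar-versus-vector comparison in SLP's cost model, and the corei7 (SSE4) run stops vectorizing the cases below while the baseline SSE2 run still does. To regenerate one configuration by hand, the RUN line above can be invoked directly, e.g. (input file name illustrative):

  opt < ctlz.ll -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S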
@@ -136,11 +136,32 @@ define void @ctlz_4i64() #0 {
}
define void @ctlz_4i32() #0 {
-; CHECK-LABEL: @ctlz_4i32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
-; CHECK-NEXT: ret void
+; SSE2-LABEL: @ctlz_4i32(
+; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; SSE2-NEXT: ret void
+;
+; SSE4-LABEL: @ctlz_4i32(
+; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4
+; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
+; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
+; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
+; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4
+; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
+; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
+; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
+; SSE4-NEXT: ret void
+;
+; AVX-LABEL: @ctlz_4i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX-NEXT: ret void
;
%ld0 = load i32, ptr @src32, align 4
%ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
@@ -158,14 +179,41 @@ define void @ctlz_4i32() #0 {
}
define void @ctlz_8i32() #0 {
-; SSE-LABEL: @ctlz_8i32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2
-; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 2
-; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
-; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false)
-; SSE-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
-; SSE-NEXT: ret void
+; SSE2-LABEL: @ctlz_8i32(
+; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2
+; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 2
+; SSE2-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
+; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false)
+; SSE2-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
+; SSE2-NEXT: ret void
+;
+; SSE4-LABEL: @ctlz_8i32(
+; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2
+; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
+; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
+; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
+; SSE4-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
+; SSE4-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
+; SSE4-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
+; SSE4-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
+; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE4-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
+; SSE4-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
+; SSE4-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
+; SSE4-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
+; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2
+; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
+; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
+; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
+; SSE4-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
+; SSE4-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
+; SSE4-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
+; SSE4-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
+; SSE4-NEXT: ret void
;
; AVX-LABEL: @ctlz_8i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2