[llvm] [X86] Handle BSF/BSR "zero-input pass through" behaviour (PR #123623)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 23 04:00:10 PST 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/123623
>From b01975b2fd39fe035f4c3be6d00a726af57d09cd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 21 Aug 2024 16:10:40 +0100
Subject: [PATCH 1/4] [X86] Handle BSF/BSR "zero-input fall through" behaviour
Intel docs have been updated to be similar to AMD and now describe BSF/BSR as not changing the destination register if the input value was zero, which allows us to support CTTZ/CTLZ zero-input cases by setting the destination to the NumBits result (BSR is a bit messy as its result has to be XOR'd to create a CTLZ result). VIA/Zhaoxin x86_64 CPUs have also been confirmed to match this behaviour.
There are still some limits to this - it's only supported for x86_64 capable processors (and I've only enabled it for x86_64 codegen), and there are some Intel CPUs that don't correctly zero the upper 32 bits of a pass through register when used for BSR32/BSF32 with a zero source value (i.e. the whole 64 bits may get passed through).
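As an illustration (lifted from the ctlz.ll test diffs below), a 32-bit CTLZ of a possibly-zero source currently needs a CMOV to select the zero-input result:

  bsrl %edi, %ecx
  movl $63, %eax
  cmovnel %ecx, %eax
  xorl $31, %eax

With the pass through behaviour we can preload the destination register with the zero-input result and drop the CMOV entirely:

  movl $63, %eax
  bsrl %edi, %eax
  xorl $31, %eax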
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 63 ++++++--
llvm/lib/Target/X86/X86InstrCompiler.td | 12 +-
llvm/lib/Target/X86/X86InstrFragments.td | 13 +-
llvm/lib/Target/X86/X86InstrInfo.cpp | 25 ++--
llvm/lib/Target/X86/X86InstrMisc.td | 50 +++----
llvm/lib/Target/X86/X86Subtarget.h | 5 +
llvm/test/CodeGen/X86/bit_ceil.ll | 12 +-
llvm/test/CodeGen/X86/combine-or.ll | 6 +-
llvm/test/CodeGen/X86/ctlo.ll | 14 +-
llvm/test/CodeGen/X86/ctlz.ll | 31 ++--
llvm/test/CodeGen/X86/cttz.ll | 3 +-
llvm/test/CodeGen/X86/pr92569.ll | 10 +-
.../CodeGen/X86/scheduler-backtracking.ll | 140 ++++++++----------
llvm/test/TableGen/x86-fold-tables.inc | 12 +-
.../X86/BtVer2/clear-super-register-1.s | 6 +-
15 files changed, 208 insertions(+), 194 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 87f3f7984989e1..032704aede2950 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3386,15 +3386,19 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
}
bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
- // Speculate cttz only if we can directly use TZCNT or can promote to i32/i64.
+ // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
+ // i32/i64 or can rely on BSF passthrough value.
return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
+ Subtarget.hasBitScanPassThrough() ||
(!Ty->isVectorTy() &&
Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
}
bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
- // Speculate ctlz only if we can directly use LZCNT.
- return Subtarget.hasLZCNT() || Subtarget.canUseCMOV();
+ // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
+ // passthrough value.
+ return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
+ Subtarget.hasBitScanPassThrough();
}
bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
@@ -28694,11 +28698,18 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
+ // Check if we can safely pass a result through BSR for zero sources.
+ SDValue PassThru = DAG.getUNDEF(OpVT);
+ if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
+ !DAG.isKnownNeverZero(Op))
+ PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
+
// Issue a bsr (scan bits in reverse) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
- Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+ Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
- if (Opc == ISD::CTLZ) {
+ // Skip CMOV if we're using a pass through value.
+ if (Opc == ISD::CTLZ && PassThru.isUndef()) {
// If src is zero (i.e. bsr sets ZF), returns NumBits.
SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
@@ -28721,16 +28732,22 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
unsigned NumBits = VT.getScalarSizeInBits();
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
+ bool NonZeroSrc = DAG.isKnownNeverZero(N0);
assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering");
+ // Check if we can safely pass a result through BSF for zero sources.
+ SDValue PassThru = DAG.getUNDEF(VT);
+ if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
+ PassThru = DAG.getConstant(NumBits, dl, VT);
+
// Issue a bsf (scan bits forward) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
- Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
+ Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
- // If src is known never zero we can skip the CMOV.
- if (DAG.isKnownNeverZero(N0))
+ // Skip CMOV if src is never zero or we're using a pass through value.
+ if (NonZeroSrc || !PassThru.isUndef())
return Op;
// If src is zero (i.e. bsf sets ZF), returns NumBits.
@@ -38193,12 +38210,34 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = KnownBits::mul(Known, Known2);
break;
}
- case X86ISD::BSR:
- // BSR(0) is undef, but any use of BSR already accounts for non-zero inputs.
- // Similar KnownBits behaviour to CTLZ_ZERO_UNDEF.
+ case X86ISD::BSF: {
+ Known.Zero.setBitsFrom(Log2_32(BitWidth));
+
+ KnownBits Known2;
+ Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ if (Known2.isNonZero()) {
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleTZ = Known2.countMaxTrailingZeros();
+ unsigned LowBits = llvm::bit_width(PossibleTZ);
+ Known.Zero.setBitsFrom(LowBits);
+ } else if (!Op.getOperand(0).isUndef()) {
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known = Known.intersectWith(Known2);
+ }
+ break;
+ }
+ case X86ISD::BSR: {
// TODO: Bound with input known bits?
Known.Zero.setBitsFrom(Log2_32(BitWidth));
+
+ if (!Op.getOperand(0).isUndef() &&
+ !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
+ KnownBits Known2;
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known = Known.intersectWith(Known2);
+ }
break;
+ }
case X86ISD::SETCC:
Known.Zero.setBitsFrom(1);
break;
@@ -54243,7 +54282,7 @@ static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
}
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
- Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
+ Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
if (VT == MVT::i8)
Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 7d4c5c0e10e492..9bda3fd7d951c9 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2213,12 +2213,12 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
(IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
// Bit scan instruction patterns to match explicit zero-undef behavior.
-def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
-def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
-def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
-def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
-def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
-def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
+def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr (i16 (IMPLICIT_DEF)), GR16:$src)>;
+def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr (i32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr (i64 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm (i16 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm (i32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm (i64 (IMPLICIT_DEF)), addr:$src)>;
// When HasMOVBE is enabled it is possible to get a non-legalized
// register-register 16 bit bswap. This maps it to a ROL instruction.
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index ea7af893ce103f..8f038330cd2398 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -134,8 +134,8 @@ def SDTX86Cmpccxadd : SDTypeProfile<1, 4, [SDTCisSameAs<0, 2>,
def X86MFence : SDNode<"X86ISD::MFENCE", SDTNone, [SDNPHasChain]>;
-def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
-def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
+def X86bsf : SDNode<"X86ISD::BSF", SDTBinaryArithWithFlags>;
+def X86bsr : SDNode<"X86ISD::BSR", SDTBinaryArithWithFlags>;
def X86fshl : SDNode<"X86ISD::FSHL", SDTIntShiftDOp>;
def X86fshr : SDNode<"X86ISD::FSHR", SDTIntShiftDOp>;
@@ -685,8 +685,9 @@ def anyext_sdiv : PatFrag<(ops node:$lhs), (anyext node:$lhs),[{
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
// be copying from a truncate. AssertSext/AssertZext/AssertAlign aren't saying
// anything about the upper 32 bits, they're probably just qualifying a
-// CopyFromReg. FREEZE may be coming from a a truncate. Any other 32-bit
-// operation will zero-extend up to 64 bits.
+// CopyFromReg. FREEZE may be coming from a truncate. BitScan fall through
+// values may not zero the upper bits correctly.
+// Any other 32-bit operation will zero-extend up to 64 bits.
def def32 : PatLeaf<(i32 GR32:$src), [{
return N->getOpcode() != ISD::TRUNCATE &&
N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
@@ -694,7 +695,9 @@ def def32 : PatLeaf<(i32 GR32:$src), [{
N->getOpcode() != ISD::AssertSext &&
N->getOpcode() != ISD::AssertZext &&
N->getOpcode() != ISD::AssertAlign &&
- N->getOpcode() != ISD::FREEZE;
+ N->getOpcode() != ISD::FREEZE &&
+ !((N->getOpcode() == X86ISD::BSF|| N->getOpcode() == X86ISD::BSR) &&
+ (!N->getOperand(0).isUndef() && !isa<ConstantSDNode>(N->getOperand(0))));
}]>;
// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 30a5161bbcc502..562c76a1029a10 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5221,42 +5221,42 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
}
/// Check whether the use can be converted to remove a comparison against zero.
-static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
+static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
- return X86::COND_INVALID;
+ return std::make_pair(X86::COND_INVALID, ~0U);
CASE_ND(NEG8r)
CASE_ND(NEG16r)
CASE_ND(NEG32r)
CASE_ND(NEG64r)
- return X86::COND_AE;
+ return std::make_pair(X86::COND_AE, 1U);
case X86::LZCNT16rr:
case X86::LZCNT32rr:
case X86::LZCNT64rr:
- return X86::COND_B;
+ return std::make_pair(X86::COND_B, 1U);
case X86::POPCNT16rr:
case X86::POPCNT32rr:
case X86::POPCNT64rr:
- return X86::COND_E;
+ return std::make_pair(X86::COND_E, 1U);
case X86::TZCNT16rr:
case X86::TZCNT32rr:
case X86::TZCNT64rr:
- return X86::COND_B;
+ return std::make_pair(X86::COND_B, 1U);
case X86::BSF16rr:
case X86::BSF32rr:
case X86::BSF64rr:
case X86::BSR16rr:
case X86::BSR32rr:
case X86::BSR64rr:
- return X86::COND_E;
+ return std::make_pair(X86::COND_E, 2U);
case X86::BLSI32rr:
case X86::BLSI64rr:
- return X86::COND_AE;
+ return std::make_pair(X86::COND_AE, 1U);
case X86::BLSR32rr:
case X86::BLSR64rr:
case X86::BLSMSK32rr:
case X86::BLSMSK64rr:
- return X86::COND_B;
+ return std::make_pair(X86::COND_B, 1U);
// TODO: TBM instructions.
}
}
@@ -5337,6 +5337,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
bool ClearsOverflowFlag = false;
bool ShouldUpdateCC = false;
bool IsSwapped = false;
+ unsigned OpNo = 0;
X86::CondCode NewCC = X86::COND_INVALID;
int64_t ImmDelta = 0;
@@ -5392,9 +5393,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// ... // EFLAGS not changed
// testl %eax, %eax // <-- can be removed
if (IsCmpZero) {
- NewCC = isUseDefConvertible(Inst);
- if (NewCC != X86::COND_INVALID && Inst.getOperand(1).isReg() &&
- Inst.getOperand(1).getReg() == SrcReg) {
+ std::tie(NewCC, OpNo) = isUseDefConvertible(Inst);
+ if (NewCC != X86::COND_INVALID && Inst.getOperand(OpNo).isReg() &&
+ Inst.getOperand(OpNo).getReg() == SrcReg) {
ShouldUpdateCC = true;
MI = &Inst;
break;
diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td
index 43c02c4f85844c..290d91bb2ce699 100644
--- a/llvm/lib/Target/X86/X86InstrMisc.td
+++ b/llvm/lib/Target/X86/X86InstrMisc.td
@@ -247,55 +247,55 @@ def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
} // Constraints = "$src = $dst", SchedRW
// Bit scan instructions.
-let Defs = [EFLAGS] in {
-def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+let Defs = [EFLAGS], Constraints = "$fallback = $dst" in {
+def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$fallback, GR16:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>,
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$fallback, GR16:$src))]>,
TB, OpSize16, Sched<[WriteBSF]>;
-def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins GR16:$fallback, i16mem:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>,
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$fallback, (loadi16 addr:$src)))]>,
TB, OpSize16, Sched<[WriteBSFLd]>;
-def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$fallback, GR32:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>,
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$fallback, GR32:$src))]>,
TB, OpSize32, Sched<[WriteBSF]>;
-def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins GR32:$fallback, i32mem:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>,
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$fallback, (loadi32 addr:$src)))]>,
TB, OpSize32, Sched<[WriteBSFLd]>;
-def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$fallback, GR64:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>,
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$fallback, GR64:$src))]>,
TB, Sched<[WriteBSF]>;
-def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins GR64:$fallback, i64mem:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>,
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$fallback, (loadi64 addr:$src)))]>,
TB, Sched<[WriteBSFLd]>;
-def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$fallback, GR16:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>,
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$fallback, GR16:$src))]>,
TB, OpSize16, Sched<[WriteBSR]>;
-def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins GR16:$fallback, i16mem:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>,
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$fallback, (loadi16 addr:$src)))]>,
TB, OpSize16, Sched<[WriteBSRLd]>;
-def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$fallback, GR32:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>,
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$fallback, GR32:$src))]>,
TB, OpSize32, Sched<[WriteBSR]>;
-def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins GR32:$fallback, i32mem:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>,
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$fallback, (loadi32 addr:$src)))]>,
TB, OpSize32, Sched<[WriteBSRLd]>;
-def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$fallback, GR64:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>,
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$fallback, GR64:$src))]>,
TB, Sched<[WriteBSR]>;
-def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins GR64:$fallback, i64mem:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>,
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$fallback, (loadi64 addr:$src)))]>,
TB, Sched<[WriteBSRLd]>;
} // Defs = [EFLAGS]
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index e3cb9ee8ce1909..c399989f115d75 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -263,6 +263,11 @@ class X86Subtarget final : public X86GenSubtargetInfo {
return hasBWI() && useAVX512Regs();
}
+ // Returns true if the destination register of a BSF/BSR instruction is
+ // not touched if the source register is zero.
+ // NOTE: i32->i64 implicit zext isn't guaranteed by BSR/BSF pass through.
+ bool hasBitScanPassThrough() const { return is64Bit(); }
+
bool isXRaySupported() const override { return is64Bit(); }
/// Use clflush if we have SSE2 or we're on x86-64 (even if we asked for
diff --git a/llvm/test/CodeGen/X86/bit_ceil.ll b/llvm/test/CodeGen/X86/bit_ceil.ll
index 823453087f6180..1f21fcac8341d5 100644
--- a/llvm/test/CodeGen/X86/bit_ceil.ll
+++ b/llvm/test/CodeGen/X86/bit_ceil.ll
@@ -10,9 +10,8 @@ define i32 @bit_ceil_i32(i32 %x) {
; NOBMI: # %bb.0:
; NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
; NOBMI-NEXT: leal -1(%rdi), %eax
-; NOBMI-NEXT: bsrl %eax, %eax
; NOBMI-NEXT: movl $63, %ecx
-; NOBMI-NEXT: cmovnel %eax, %ecx
+; NOBMI-NEXT: bsrl %eax, %ecx
; NOBMI-NEXT: xorl $31, %ecx
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
@@ -47,9 +46,8 @@ define i32 @bit_ceil_i32(i32 %x) {
define i32 @bit_ceil_i32_plus1(i32 noundef %x) {
; NOBMI-LABEL: bit_ceil_i32_plus1:
; NOBMI: # %bb.0: # %entry
-; NOBMI-NEXT: bsrl %edi, %eax
; NOBMI-NEXT: movl $63, %ecx
-; NOBMI-NEXT: cmovnel %eax, %ecx
+; NOBMI-NEXT: bsrl %edi, %ecx
; NOBMI-NEXT: xorl $31, %ecx
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
@@ -86,9 +84,8 @@ define i64 @bit_ceil_i64(i64 %x) {
; NOBMI-LABEL: bit_ceil_i64:
; NOBMI: # %bb.0:
; NOBMI-NEXT: leaq -1(%rdi), %rax
-; NOBMI-NEXT: bsrq %rax, %rax
; NOBMI-NEXT: movl $127, %ecx
-; NOBMI-NEXT: cmovneq %rax, %rcx
+; NOBMI-NEXT: bsrq %rax, %rcx
; NOBMI-NEXT: xorl $63, %ecx
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
@@ -122,9 +119,8 @@ define i64 @bit_ceil_i64(i64 %x) {
define i64 @bit_ceil_i64_plus1(i64 noundef %x) {
; NOBMI-LABEL: bit_ceil_i64_plus1:
; NOBMI: # %bb.0: # %entry
-; NOBMI-NEXT: bsrq %rdi, %rax
; NOBMI-NEXT: movl $127, %ecx
-; NOBMI-NEXT: cmovneq %rax, %rcx
+; NOBMI-NEXT: bsrq %rdi, %rcx
; NOBMI-NEXT: xorl $63, %ecx
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index d9c6d7053be746..08262e4d34b269 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -227,9 +227,8 @@ define i64 @PR89533(<64 x i8> %a0) {
; SSE-NEXT: orl %eax, %edx
; SSE-NEXT: shlq $32, %rdx
; SSE-NEXT: orq %rcx, %rdx
-; SSE-NEXT: bsfq %rdx, %rcx
; SSE-NEXT: movl $64, %eax
-; SSE-NEXT: cmovneq %rcx, %rax
+; SSE-NEXT: rep bsfq %rdx, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: PR89533:
@@ -255,9 +254,8 @@ define i64 @PR89533(<64 x i8> %a0) {
; AVX1-NEXT: orl %eax, %edx
; AVX1-NEXT: shlq $32, %rdx
; AVX1-NEXT: orq %rcx, %rdx
-; AVX1-NEXT: bsfq %rdx, %rcx
; AVX1-NEXT: movl $64, %eax
-; AVX1-NEXT: cmovneq %rcx, %rax
+; AVX1-NEXT: rep bsfq %rdx, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index 2f4fef82f1f17a..fecb62fbc5aea6 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -44,10 +44,9 @@ define i8 @ctlo_i8(i8 %x) {
; X64-LABEL: ctlo_i8:
; X64: # %bb.0:
; X64-NEXT: notb %dil
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: bsrl %eax, %ecx
+; X64-NEXT: movzbl %dil, %ecx
; X64-NEXT: movl $15, %eax
-; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: bsrl %ecx, %eax
; X64-NEXT: xorl $7, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
@@ -146,9 +145,8 @@ define i16 @ctlo_i16(i16 %x) {
; X64-LABEL: ctlo_i16:
; X64: # %bb.0:
; X64-NEXT: notl %edi
-; X64-NEXT: bsrw %di, %cx
; X64-NEXT: movw $31, %ax
-; X64-NEXT: cmovnew %cx, %ax
+; X64-NEXT: bsrw %di, %ax
; X64-NEXT: xorl $15, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
@@ -232,9 +230,8 @@ define i32 @ctlo_i32(i32 %x) {
; X64-LABEL: ctlo_i32:
; X64: # %bb.0:
; X64-NEXT: notl %edi
-; X64-NEXT: bsrl %edi, %ecx
; X64-NEXT: movl $63, %eax
-; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
;
@@ -335,9 +332,8 @@ define i64 @ctlo_i64(i64 %x) {
; X64-LABEL: ctlo_i64:
; X64: # %bb.0:
; X64-NEXT: notq %rdi
-; X64-NEXT: bsrq %rdi, %rcx
; X64-NEXT: movl $127, %eax
-; X64-NEXT: cmovneq %rcx, %rax
+; X64-NEXT: bsrq %rdi, %rax
; X64-NEXT: xorq $63, %rax
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index 68defaff78d37d..0eabfeae853f79 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -246,10 +246,9 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
;
; X64-LABEL: ctlz_i8_zero_test:
; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: bsrl %eax, %ecx
+; X64-NEXT: movzbl %dil, %ecx
; X64-NEXT: movl $15, %eax
-; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: bsrl %ecx, %eax
; X64-NEXT: xorl $7, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
@@ -317,9 +316,8 @@ define i16 @ctlz_i16_zero_test(i16 %n) {
;
; X64-LABEL: ctlz_i16_zero_test:
; X64: # %bb.0:
-; X64-NEXT: bsrw %di, %cx
; X64-NEXT: movw $31, %ax
-; X64-NEXT: cmovnew %cx, %ax
+; X64-NEXT: bsrw %di, %ax
; X64-NEXT: xorl $15, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
@@ -372,9 +370,8 @@ define i32 @ctlz_i32_zero_test(i32 %n) {
;
; X64-LABEL: ctlz_i32_zero_test:
; X64: # %bb.0:
-; X64-NEXT: bsrl %edi, %ecx
; X64-NEXT: movl $63, %eax
-; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
;
@@ -442,9 +439,8 @@ define i64 @ctlz_i64_zero_test(i64 %n) {
;
; X64-LABEL: ctlz_i64_zero_test:
; X64: # %bb.0:
-; X64-NEXT: bsrq %rdi, %rcx
; X64-NEXT: movl $127, %eax
-; X64-NEXT: cmovneq %rcx, %rax
+; X64-NEXT: bsrq %rdi, %rax
; X64-NEXT: xorq $63, %rax
; X64-NEXT: retq
;
@@ -613,9 +609,8 @@ define i32 @ctlz_bsr_zero_test(i32 %n) {
;
; X64-LABEL: ctlz_bsr_zero_test:
; X64: # %bb.0:
-; X64-NEXT: bsrl %edi, %ecx
; X64-NEXT: movl $63, %eax
-; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlz_bsr_zero_test:
@@ -983,10 +978,9 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
;
; X64-LABEL: ctlz_xor7_i8_false:
; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: bsrl %eax, %ecx
+; X64-NEXT: movzbl %dil, %ecx
; X64-NEXT: movl $15, %eax
-; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: bsrl %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
@@ -1094,9 +1088,8 @@ define i32 @ctlz_xor31_i32_false(i32 %x) {
;
; X64-LABEL: ctlz_xor31_i32_false:
; X64: # %bb.0:
-; X64-NEXT: bsrl %edi, %ecx
; X64-NEXT: movl $63, %eax
-; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlz_xor31_i32_false:
@@ -1239,9 +1232,8 @@ define i64 @ctlz_i32_sext(i32 %x) {
;
; X64-LABEL: ctlz_i32_sext:
; X64: # %bb.0:
-; X64-NEXT: bsrl %edi, %ecx
; X64-NEXT: movl $63, %eax
-; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlz_i32_sext:
@@ -1302,9 +1294,8 @@ define i64 @ctlz_i32_zext(i32 %x) {
;
; X64-LABEL: ctlz_i32_zext:
; X64: # %bb.0:
-; X64-NEXT: bsrl %edi, %ecx
; X64-NEXT: movl $63, %eax
-; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: ctlz_i32_zext:
diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll
index 30e5cccfb21982..fc7d6fb2f2b6c2 100644
--- a/llvm/test/CodeGen/X86/cttz.ll
+++ b/llvm/test/CodeGen/X86/cttz.ll
@@ -393,9 +393,8 @@ define i64 @cttz_i64_zero_test(i64 %n) {
;
; X64-LABEL: cttz_i64_zero_test:
; X64: # %bb.0:
-; X64-NEXT: bsfq %rdi, %rcx
; X64-NEXT: movl $64, %eax
-; X64-NEXT: cmovneq %rcx, %rax
+; X64-NEXT: rep bsfq %rdi, %rax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: cttz_i64_zero_test:
diff --git a/llvm/test/CodeGen/X86/pr92569.ll b/llvm/test/CodeGen/X86/pr92569.ll
index 0fb4ed7905287c..5f306e998398f2 100644
--- a/llvm/test/CodeGen/X86/pr92569.ll
+++ b/llvm/test/CodeGen/X86/pr92569.ll
@@ -4,13 +4,11 @@
define void @PR92569(i64 %arg, <8 x i8> %arg1) {
; CHECK-LABEL: PR92569:
; CHECK: # %bb.0:
-; CHECK-NEXT: bsfq %rdi, %rax
-; CHECK-NEXT: movl $64, %ecx
-; CHECK-NEXT: cmovneq %rax, %rcx
-; CHECK-NEXT: shrb $3, %cl
+; CHECK-NEXT: movl $64, %eax
+; CHECK-NEXT: rep bsfq %rdi, %rax
+; CHECK-NEXT: shrb $3, %al
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: andl $15, %eax
+; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: movzbl -24(%rsp,%rax), %eax
; CHECK-NEXT: movl %eax, 0
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 6be79edbe51e10..426587a84ce179 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -234,16 +234,15 @@ define i256 @test2(i256 %a) nounwind {
; ILP-NEXT: xorq $63, %rdx
; ILP-NEXT: andq %rsi, %r11
; ILP-NEXT: movl $127, %esi
-; ILP-NEXT: bsrq %r11, %r8
-; ILP-NEXT: cmoveq %rsi, %r8
-; ILP-NEXT: xorq $63, %r8
-; ILP-NEXT: addq $64, %r8
+; ILP-NEXT: bsrq %r11, %rsi
+; ILP-NEXT: xorq $63, %rsi
+; ILP-NEXT: addq $64, %rsi
; ILP-NEXT: testq %r10, %r10
-; ILP-NEXT: cmovneq %rdx, %r8
-; ILP-NEXT: subq $-128, %r8
+; ILP-NEXT: cmovneq %rdx, %rsi
+; ILP-NEXT: subq $-128, %rsi
; ILP-NEXT: orq %rdi, %r9
-; ILP-NEXT: cmovneq %rcx, %r8
-; ILP-NEXT: movq %r8, (%rax)
+; ILP-NEXT: cmovneq %rcx, %rsi
+; ILP-NEXT: movq %rsi, (%rax)
; ILP-NEXT: movq $0, 8(%rax)
; ILP-NEXT: retq
;
@@ -274,16 +273,15 @@ define i256 @test2(i256 %a) nounwind {
; HYBRID-NEXT: xorq $63, %rdx
; HYBRID-NEXT: andq %rsi, %r11
; HYBRID-NEXT: movl $127, %esi
-; HYBRID-NEXT: bsrq %r11, %r8
-; HYBRID-NEXT: cmoveq %rsi, %r8
-; HYBRID-NEXT: xorq $63, %r8
-; HYBRID-NEXT: addq $64, %r8
+; HYBRID-NEXT: bsrq %r11, %rsi
+; HYBRID-NEXT: xorq $63, %rsi
+; HYBRID-NEXT: addq $64, %rsi
; HYBRID-NEXT: testq %r10, %r10
-; HYBRID-NEXT: cmovneq %rdx, %r8
-; HYBRID-NEXT: subq $-128, %r8
+; HYBRID-NEXT: cmovneq %rdx, %rsi
+; HYBRID-NEXT: subq $-128, %rsi
; HYBRID-NEXT: orq %rdi, %r9
-; HYBRID-NEXT: cmovneq %rcx, %r8
-; HYBRID-NEXT: movq %r8, (%rax)
+; HYBRID-NEXT: cmovneq %rcx, %rsi
+; HYBRID-NEXT: movq %rsi, (%rax)
; HYBRID-NEXT: movq $0, 8(%rax)
; HYBRID-NEXT: retq
;
@@ -314,16 +312,15 @@ define i256 @test2(i256 %a) nounwind {
; BURR-NEXT: xorq $63, %rdx
; BURR-NEXT: andq %rsi, %r11
; BURR-NEXT: movl $127, %esi
-; BURR-NEXT: bsrq %r11, %r8
-; BURR-NEXT: cmoveq %rsi, %r8
-; BURR-NEXT: xorq $63, %r8
-; BURR-NEXT: addq $64, %r8
+; BURR-NEXT: bsrq %r11, %rsi
+; BURR-NEXT: xorq $63, %rsi
+; BURR-NEXT: addq $64, %rsi
; BURR-NEXT: testq %r10, %r10
-; BURR-NEXT: cmovneq %rdx, %r8
-; BURR-NEXT: subq $-128, %r8
+; BURR-NEXT: cmovneq %rdx, %rsi
+; BURR-NEXT: subq $-128, %rsi
; BURR-NEXT: orq %rdi, %r9
-; BURR-NEXT: cmovneq %rcx, %r8
-; BURR-NEXT: movq %r8, (%rax)
+; BURR-NEXT: cmovneq %rcx, %rsi
+; BURR-NEXT: movq %rsi, (%rax)
; BURR-NEXT: movq $0, 8(%rax)
; BURR-NEXT: retq
;
@@ -351,19 +348,18 @@ define i256 @test2(i256 %a) nounwind {
; SRC-NEXT: cmovneq %rcx, %rdx
; SRC-NEXT: bsrq %r10, %rcx
; SRC-NEXT: xorq $63, %rcx
+; SRC-NEXT: movl $127, %esi
; SRC-NEXT: bsrq %r11, %rsi
-; SRC-NEXT: movl $127, %r8d
-; SRC-NEXT: cmovneq %rsi, %r8
-; SRC-NEXT: xorq $63, %r8
-; SRC-NEXT: addq $64, %r8
+; SRC-NEXT: xorq $63, %rsi
+; SRC-NEXT: addq $64, %rsi
; SRC-NEXT: testq %r10, %r10
-; SRC-NEXT: cmovneq %rcx, %r8
-; SRC-NEXT: subq $-128, %r8
+; SRC-NEXT: cmovneq %rcx, %rsi
+; SRC-NEXT: subq $-128, %rsi
; SRC-NEXT: orq %r9, %rdi
-; SRC-NEXT: cmovneq %rdx, %r8
+; SRC-NEXT: cmovneq %rdx, %rsi
; SRC-NEXT: xorps %xmm0, %xmm0
; SRC-NEXT: movaps %xmm0, 16(%rax)
-; SRC-NEXT: movq %r8, (%rax)
+; SRC-NEXT: movq %rsi, (%rax)
; SRC-NEXT: movq $0, 8(%rax)
; SRC-NEXT: retq
;
@@ -372,12 +368,11 @@ define i256 @test2(i256 %a) nounwind {
; LIN-NEXT: movq %rdi, %rax
; LIN-NEXT: xorps %xmm0, %xmm0
; LIN-NEXT: movaps %xmm0, 16(%rdi)
-; LIN-NEXT: movq %rsi, %rdi
-; LIN-NEXT: negq %rdi
-; LIN-NEXT: andq %rsi, %rdi
-; LIN-NEXT: bsrq %rdi, %rsi
; LIN-NEXT: movl $127, %edi
-; LIN-NEXT: cmovneq %rsi, %rdi
+; LIN-NEXT: movq %rsi, %r9
+; LIN-NEXT: negq %r9
+; LIN-NEXT: andq %rsi, %r9
+; LIN-NEXT: bsrq %r9, %rdi
; LIN-NEXT: xorq $63, %rdi
; LIN-NEXT: addq $64, %rdi
; LIN-NEXT: xorl %esi, %esi
@@ -415,7 +410,6 @@ define i256 @test2(i256 %a) nounwind {
define i256 @test3(i256 %n) nounwind {
; ILP-LABEL: test3:
; ILP: # %bb.0:
-; ILP-NEXT: pushq %rbx
; ILP-NEXT: movq %rdi, %rax
; ILP-NEXT: xorps %xmm0, %xmm0
; ILP-NEXT: movaps %xmm0, 16(%rdi)
@@ -429,34 +423,32 @@ define i256 @test3(i256 %n) nounwind {
; ILP-NEXT: sbbq %r8, %r9
; ILP-NEXT: notq %r8
; ILP-NEXT: andq %r9, %r8
-; ILP-NEXT: bsrq %r8, %rbx
+; ILP-NEXT: bsrq %r8, %r9
; ILP-NEXT: notq %rdx
; ILP-NEXT: andq %r10, %rdx
-; ILP-NEXT: bsrq %rdx, %r9
-; ILP-NEXT: xorq $63, %rbx
+; ILP-NEXT: bsrq %rdx, %r10
+; ILP-NEXT: xorq $63, %r9
; ILP-NEXT: notq %rcx
; ILP-NEXT: andq %r11, %rcx
-; ILP-NEXT: bsrq %rcx, %r10
+; ILP-NEXT: bsrq %rcx, %r11
+; ILP-NEXT: xorq $63, %r11
+; ILP-NEXT: orq $64, %r11
+; ILP-NEXT: testq %r8, %r8
+; ILP-NEXT: cmovneq %r9, %r11
; ILP-NEXT: xorq $63, %r10
-; ILP-NEXT: orq $64, %r10
; ILP-NEXT: notq %rsi
-; ILP-NEXT: testq %r8, %r8
-; ILP-NEXT: cmovneq %rbx, %r10
-; ILP-NEXT: xorq $63, %r9
; ILP-NEXT: andq %rdi, %rsi
; ILP-NEXT: movl $127, %edi
-; ILP-NEXT: bsrq %rsi, %rsi
-; ILP-NEXT: cmoveq %rdi, %rsi
-; ILP-NEXT: xorq $63, %rsi
-; ILP-NEXT: addq $64, %rsi
+; ILP-NEXT: bsrq %rsi, %rdi
+; ILP-NEXT: xorq $63, %rdi
+; ILP-NEXT: addq $64, %rdi
; ILP-NEXT: testq %rdx, %rdx
-; ILP-NEXT: cmovneq %r9, %rsi
-; ILP-NEXT: subq $-128, %rsi
+; ILP-NEXT: cmovneq %r10, %rdi
+; ILP-NEXT: subq $-128, %rdi
; ILP-NEXT: orq %r8, %rcx
-; ILP-NEXT: cmovneq %r10, %rsi
-; ILP-NEXT: movq %rsi, (%rax)
+; ILP-NEXT: cmovneq %r11, %rdi
+; ILP-NEXT: movq %rdi, (%rax)
; ILP-NEXT: movq $0, 8(%rax)
-; ILP-NEXT: popq %rbx
; ILP-NEXT: retq
;
; HYBRID-LABEL: test3:
@@ -491,16 +483,15 @@ define i256 @test3(i256 %n) nounwind {
; HYBRID-NEXT: notq %rsi
; HYBRID-NEXT: andq %rdi, %rsi
; HYBRID-NEXT: movl $127, %edi
-; HYBRID-NEXT: bsrq %rsi, %rsi
-; HYBRID-NEXT: cmoveq %rdi, %rsi
-; HYBRID-NEXT: xorq $63, %rsi
-; HYBRID-NEXT: addq $64, %rsi
+; HYBRID-NEXT: bsrq %rsi, %rdi
+; HYBRID-NEXT: xorq $63, %rdi
+; HYBRID-NEXT: addq $64, %rdi
; HYBRID-NEXT: testq %rdx, %rdx
-; HYBRID-NEXT: cmovneq %r10, %rsi
-; HYBRID-NEXT: subq $-128, %rsi
+; HYBRID-NEXT: cmovneq %r10, %rdi
+; HYBRID-NEXT: subq $-128, %rdi
; HYBRID-NEXT: orq %r8, %rcx
-; HYBRID-NEXT: cmovneq %r9, %rsi
-; HYBRID-NEXT: movq %rsi, (%rax)
+; HYBRID-NEXT: cmovneq %r9, %rdi
+; HYBRID-NEXT: movq %rdi, (%rax)
; HYBRID-NEXT: movq $0, 8(%rax)
; HYBRID-NEXT: popq %rbx
; HYBRID-NEXT: retq
@@ -537,16 +528,15 @@ define i256 @test3(i256 %n) nounwind {
; BURR-NEXT: notq %rsi
; BURR-NEXT: andq %rdi, %rsi
; BURR-NEXT: movl $127, %edi
-; BURR-NEXT: bsrq %rsi, %rsi
-; BURR-NEXT: cmoveq %rdi, %rsi
-; BURR-NEXT: xorq $63, %rsi
-; BURR-NEXT: addq $64, %rsi
+; BURR-NEXT: bsrq %rsi, %rdi
+; BURR-NEXT: xorq $63, %rdi
+; BURR-NEXT: addq $64, %rdi
; BURR-NEXT: testq %rdx, %rdx
-; BURR-NEXT: cmovneq %r10, %rsi
-; BURR-NEXT: subq $-128, %rsi
+; BURR-NEXT: cmovneq %r10, %rdi
+; BURR-NEXT: subq $-128, %rdi
; BURR-NEXT: orq %r8, %rcx
-; BURR-NEXT: cmovneq %r9, %rsi
-; BURR-NEXT: movq %rsi, (%rax)
+; BURR-NEXT: cmovneq %r9, %rdi
+; BURR-NEXT: movq %rdi, (%rax)
; BURR-NEXT: movq $0, 8(%rax)
; BURR-NEXT: popq %rbx
; BURR-NEXT: retq
@@ -579,9 +569,8 @@ define i256 @test3(i256 %n) nounwind {
; SRC-NEXT: cmovneq %rdi, %r9
; SRC-NEXT: bsrq %rdx, %rdi
; SRC-NEXT: xorq $63, %rdi
-; SRC-NEXT: bsrq %rsi, %rsi
; SRC-NEXT: movl $127, %r10d
-; SRC-NEXT: cmovneq %rsi, %r10
+; SRC-NEXT: bsrq %rsi, %r10
; SRC-NEXT: xorq $63, %r10
; SRC-NEXT: addq $64, %r10
; SRC-NEXT: testq %rdx, %rdx
@@ -600,13 +589,12 @@ define i256 @test3(i256 %n) nounwind {
; LIN-NEXT: movq %rdi, %rax
; LIN-NEXT: xorps %xmm0, %xmm0
; LIN-NEXT: movaps %xmm0, 16(%rdi)
+; LIN-NEXT: movl $127, %r9d
; LIN-NEXT: movq %rsi, %rdi
; LIN-NEXT: negq %rdi
; LIN-NEXT: notq %rsi
; LIN-NEXT: andq %rdi, %rsi
-; LIN-NEXT: bsrq %rsi, %rsi
-; LIN-NEXT: movl $127, %r9d
-; LIN-NEXT: cmovneq %rsi, %r9
+; LIN-NEXT: bsrq %rsi, %r9
; LIN-NEXT: xorq $63, %r9
; LIN-NEXT: addq $64, %r9
; LIN-NEXT: xorl %edi, %edi
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 954c05bdb20767..2ab63392c7076a 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -684,12 +684,6 @@ static const X86FoldTableEntry Table1[] = {
{X86::BLSR64rr, X86::BLSR64rm, 0},
{X86::BLSR64rr_EVEX, X86::BLSR64rm_EVEX, 0},
{X86::BLSR64rr_NF, X86::BLSR64rm_NF, 0},
- {X86::BSF16rr, X86::BSF16rm, 0},
- {X86::BSF32rr, X86::BSF32rm, 0},
- {X86::BSF64rr, X86::BSF64rm, 0},
- {X86::BSR16rr, X86::BSR16rm, 0},
- {X86::BSR32rr, X86::BSR32rm, 0},
- {X86::BSR64rr, X86::BSR64rm, 0},
{X86::BZHI32rr, X86::BZHI32rm, 0},
{X86::BZHI32rr_EVEX, X86::BZHI32rm_EVEX, 0},
{X86::BZHI32rr_NF, X86::BZHI32rm_NF, 0},
@@ -2072,6 +2066,12 @@ static const X86FoldTableEntry Table2[] = {
{X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16},
{X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16},
{X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16},
+ {X86::BSF16rr, X86::BSF16rm, 0},
+ {X86::BSF32rr, X86::BSF32rm, 0},
+ {X86::BSF64rr, X86::BSF64rm, 0},
+ {X86::BSR16rr, X86::BSR16rm, 0},
+ {X86::BSR32rr, X86::BSR32rm, 0},
+ {X86::BSR64rr, X86::BSR64rm, 0},
{X86::CMOV16rr, X86::CMOV16rm, 0},
{X86::CMOV16rr_ND, X86::CMOV16rm_ND, 0},
{X86::CMOV32rr, X86::CMOV32rm, 0},
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s b/llvm/test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s
index 6483809deda3a9..0bd5f451e2e341 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s
@@ -15,12 +15,12 @@ bsf %rax, %rcx
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 655
+# CHECK-NEXT: Total Cycles: 663
# CHECK-NEXT: Total uOps: 1000
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 1.53
-# CHECK-NEXT: IPC: 0.61
+# CHECK-NEXT: uOps Per Cycle: 1.51
+# CHECK-NEXT: IPC: 0.60
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Instruction Info:
>From aad5d6373f5ac53ac3b774eb029f814070244634 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 21 Jan 2025 10:42:46 +0000
Subject: [PATCH 2/4] [X86] Remove CTTZ i32 -> i64 promotion
With fall through support, we don't need to load i64 constants anymore
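For example, cttz_i32_zero_test in llvm/test/CodeGen/X86/cttz.ll previously promoted the CTTZ to i64 and had to materialise a 64-bit constant to mark bit 32:

  movabsq $4294967296, %rax # imm = 0x100000000
  orq %rdi, %rax
  rep bsfq %rax, %rax

and now stays at i32 with the fall through value preloaded into the destination:

  movl $32, %eax
  rep bsfl %edi, %eax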
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 1 -
llvm/test/CodeGen/X86/cttz.ll | 19 +-
llvm/test/CodeGen/X86/known-never-zero.ll | 216 ++++++++--------------
llvm/test/CodeGen/X86/pr89877.ll | 8 +-
llvm/test/CodeGen/X86/pr90847.ll | 16 +-
5 files changed, 89 insertions(+), 171 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 032704aede2950..7c7e16abff7fe3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -436,7 +436,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
if (Subtarget.is64Bit()) {
- setOperationPromotedToType(ISD::CTTZ , MVT::i32, MVT::i64);
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
}
diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll
index fc7d6fb2f2b6c2..db949827af0074 100644
--- a/llvm/test/CodeGen/X86/cttz.ll
+++ b/llvm/test/CodeGen/X86/cttz.ll
@@ -324,11 +324,8 @@ define i32 @cttz_i32_zero_test(i32 %n) {
;
; X64-LABEL: cttz_i32_zero_test:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: cttz_i32_zero_test:
@@ -686,10 +683,8 @@ define i64 @cttz_i32_sext(i32 %x) {
;
; X64-LABEL: cttz_i32_sext:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: cttz_i32_sext:
@@ -743,10 +738,8 @@ define i64 @cttz_i32_zext(i32 %x) {
;
; X64-LABEL: cttz_i32_zext:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
;
; X86-CLZ-LABEL: cttz_i32_zext:
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index 6c0aaeb451e14a..63336ffa7c6c8a 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -51,12 +51,9 @@ define i32 @or_maybe_zero(i32 %x, i32 %y) {
;
; X64-LABEL: or_maybe_zero:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: orl %esi, %edi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
%z = or i32 %x, %y
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -104,13 +101,11 @@ define i32 @select_maybe_zero(i1 %c, i32 %x) {
; X64-LABEL: select_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: orl $1, %esi
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: testb $1, %dil
-; X64-NEXT: cmovnel %esi, %eax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: cmovnel %esi, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%y = or i32 %x, 1
%z = select i1 %c, i32 %y, i32 0
@@ -201,14 +196,11 @@ define i32 @shl_maybe_zero(i32 %x, i32 %y) {
;
; X64-LABEL: shl_maybe_zero:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shll %cl, %esi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rsi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %esi, %eax
; X64-NEXT: retq
%z = shl nuw nsw i32 %y, %x
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -252,12 +244,10 @@ define i32 @uaddsat_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: uaddsat_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: addl %esi, %edi
-; X64-NEXT: movl $-1, %eax
-; X64-NEXT: cmovael %edi, %eax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $-1, %ecx
+; X64-NEXT: cmovael %edi, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%z = call i32 @llvm.uadd.sat.i32(i32 %x, i32 %y)
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -306,13 +296,10 @@ define i32 @umax_maybe_zero(i32 %x, i32 %y) {
;
; X64-LABEL: umax_maybe_zero:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: cmpl %esi, %edi
; X64-NEXT: cmoval %edi, %esi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rsi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %esi, %eax
; X64-NEXT: retq
%z = call i32 @llvm.umax.i32(i32 %x, i32 %y)
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -365,12 +352,10 @@ define i32 @umin_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: umin_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: cmpl $54, %edi
-; X64-NEXT: movl $54, %eax
-; X64-NEXT: cmovbl %edi, %eax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $54, %ecx
+; X64-NEXT: cmovbl %edi, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%z = call i32 @llvm.umin.i32(i32 %x, i32 54)
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -479,12 +464,10 @@ define i32 @smin_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: smin_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: cmpl $54, %edi
-; X64-NEXT: movl $54, %eax
-; X64-NEXT: cmovll %edi, %eax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $54, %ecx
+; X64-NEXT: cmovll %edi, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%z = call i32 @llvm.smin.i32(i32 %x, i32 54)
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -593,12 +576,10 @@ define i32 @smax_known_zero(i32 %x, i32 %y) {
; X64-LABEL: smax_known_zero:
; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
-; X64-NEXT: movl $-1, %eax
-; X64-NEXT: cmovnsl %edi, %eax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $-1, %ecx
+; X64-NEXT: cmovnsl %edi, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%z = call i32 @llvm.smax.i32(i32 %x, i32 -1)
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -646,13 +627,10 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotr_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: rorl %cl, %edi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
%shr = lshr i32 %x, %y
%sub = sub i32 32, %y
@@ -700,13 +678,10 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotr_with_fshr_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: rorl %cl, %edi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
%z = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 %y)
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -754,13 +729,10 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotl_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: roll %cl, %edi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
%shl = shl i32 %x, %y
%sub = sub i32 32, %y
@@ -808,13 +780,10 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotl_with_fshl_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: roll %cl, %edi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
%z = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %y)
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -880,14 +849,11 @@ define i32 @sra_maybe_zero(i32 %x, i32 %y) {
;
; X64-LABEL: sra_maybe_zero:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: sarl %cl, %esi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rsi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %esi, %eax
; X64-NEXT: retq
%z = ashr exact i32 %y, %x
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -953,14 +919,11 @@ define i32 @srl_maybe_zero(i32 %x, i32 %y) {
;
; X64-LABEL: srl_maybe_zero:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shrl %cl, %esi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rsi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %esi, %eax
; X64-NEXT: retq
%z = lshr exact i32 %y, %x
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1007,11 +970,9 @@ define i32 @udiv_maybe_zero(i32 %x, i32 %y) {
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %esi
-; X64-NEXT: # kill: def $eax killed $eax def $rax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %ecx
+; X64-NEXT: rep bsfl %eax, %ecx
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
%z = udiv exact i32 %x, %y
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1058,11 +1019,9 @@ define i32 @sdiv_maybe_zero(i32 %x, i32 %y) {
; X64-NEXT: movl %edi, %eax
; X64-NEXT: cltd
; X64-NEXT: idivl %esi
-; X64-NEXT: # kill: def $eax killed $eax def $rax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %ecx
+; X64-NEXT: rep bsfl %eax, %ecx
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
%z = sdiv exact i32 %x, %y
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1103,13 +1062,10 @@ define i32 @add_maybe_zero(i32 %xx, i32 %y) {
;
; X64-LABEL: add_maybe_zero:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: orl $1, %edi
; X64-NEXT: addl %esi, %edi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
%x = or i32 %xx, 1
%z = add nsw i32 %x, %y
@@ -1182,13 +1138,11 @@ define i32 @sub_maybe_zero(i32 %x) {
;
; X64-LABEL: sub_maybe_zero:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: orl $64, %eax
-; X64-NEXT: subl %edi, %eax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: orl $64, %ecx
+; X64-NEXT: subl %edi, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%y = or i32 %x, 64
%z = sub i32 %y, %x
@@ -1208,12 +1162,9 @@ define i32 @sub_maybe_zero2(i32 %x) {
;
; X64-LABEL: sub_maybe_zero2:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: negl %edi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
%z = sub i32 0, %x
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1233,13 +1184,10 @@ define i32 @mul_known_nonzero_nsw(i32 %x, i32 %yy) {
;
; X64-LABEL: mul_known_nonzero_nsw:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: orl $256, %esi # imm = 0x100
; X64-NEXT: imull %edi, %esi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rsi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %esi, %eax
; X64-NEXT: retq
%y = or i32 %yy, 256
%z = mul nsw i32 %y, %x
@@ -1260,13 +1208,10 @@ define i32 @mul_known_nonzero_nuw(i32 %x, i32 %yy) {
;
; X64-LABEL: mul_known_nonzero_nuw:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: orl $256, %esi # imm = 0x100
; X64-NEXT: imull %edi, %esi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rsi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %esi, %eax
; X64-NEXT: retq
%y = or i32 %yy, 256
%z = mul nuw i32 %y, %x
@@ -1286,12 +1231,9 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) {
;
; X64-LABEL: mul_maybe_zero:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: imull %esi, %edi
-; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: rep bsfq %rax, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
%z = mul nuw nsw i32 %y, %x
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1321,11 +1263,9 @@ define i32 @bitcast_known_nonzero(<2 x i16> %xx) {
; X64-NEXT: vcvttps2dq %xmm0, %xmm0
; X64-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; X64-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,256,u,u,u,u,u,u]
-; X64-NEXT: vmovd %xmm0, %eax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: vmovd %xmm0, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%x = shl nuw nsw <2 x i16> <i16 256, i16 256>, %xx
%z = bitcast <2 x i16> %x to i32
@@ -1344,11 +1284,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) {
;
; X64-LABEL: bitcast_maybe_zero:
; X64: # %bb.0:
-; X64-NEXT: vmovd %xmm0, %eax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: vmovd %xmm0, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%z = bitcast <2 x i16> %x to i32
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1365,11 +1303,9 @@ define i32 @bitcast_from_float(float %x) {
;
; X64-LABEL: bitcast_from_float:
; X64: # %bb.0:
-; X64-NEXT: vmovd %xmm0, %eax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: vmovd %xmm0, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%z = bitcast float %x to i32
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1412,11 +1348,9 @@ define i32 @zext_maybe_zero(i16 %x) {
;
; X64-LABEL: zext_maybe_zero:
; X64: # %bb.0:
-; X64-NEXT: movzwl %di, %eax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movzwl %di, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%z = zext i16 %x to i32
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1459,11 +1393,9 @@ define i32 @sext_maybe_zero(i16 %x) {
;
; X64-LABEL: sext_maybe_zero:
; X64: # %bb.0:
-; X64-NEXT: movswl %di, %eax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movswl %di, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%z = sext i16 %x to i32
%r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
diff --git a/llvm/test/CodeGen/X86/pr89877.ll b/llvm/test/CodeGen/X86/pr89877.ll
index 19baad26583ada..a40ad8f9412788 100644
--- a/llvm/test/CodeGen/X86/pr89877.ll
+++ b/llvm/test/CodeGen/X86/pr89877.ll
@@ -20,11 +20,9 @@ define i32 @sext_known_nonzero(i16 %xx) {
; X64-NEXT: movl $256, %eax # imm = 0x100
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shll %cl, %eax
-; X64-NEXT: movswq %ax, %rax
-; X64-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: rep bsfq %rcx, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: movswl %ax, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%x = shl i16 256, %xx
%z = sext i16 %x to i32
diff --git a/llvm/test/CodeGen/X86/pr90847.ll b/llvm/test/CodeGen/X86/pr90847.ll
index f2d43c3ed8d5bd..11669f321704e3 100644
--- a/llvm/test/CodeGen/X86/pr90847.ll
+++ b/llvm/test/CodeGen/X86/pr90847.ll
@@ -14,11 +14,9 @@ define i32 @PR90847(<8 x float> %x) nounwind {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vminps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vcmpeqps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovmskps %ymm0, %eax
-; AVX1-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: rep bsfq %rcx, %rax
-; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX1-NEXT: vmovmskps %ymm0, %ecx
+; AVX1-NEXT: movl $32, %eax
+; AVX1-NEXT: rep bsfl %ecx, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -31,11 +29,9 @@ define i32 @PR90847(<8 x float> %x) nounwind {
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-NEXT: vminps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vcmpeqps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovmskps %ymm0, %eax
-; AVX2-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: rep bsfq %rcx, %rax
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vmovmskps %ymm0, %ecx
+; AVX2-NEXT: movl $32, %eax
+; AVX2-NEXT: rep bsfl %ecx, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
entry:
>From a2fb67ba520070a9f0849c5b6d0bc434075e0bc8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 21 Jan 2025 10:56:03 +0000
Subject: [PATCH 3/4] missing whitespace
---
llvm/lib/Target/X86/X86InstrFragments.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index 8f038330cd2398..ddbc7c55a6113b 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -696,7 +696,7 @@ def def32 : PatLeaf<(i32 GR32:$src), [{
N->getOpcode() != ISD::AssertZext &&
N->getOpcode() != ISD::AssertAlign &&
N->getOpcode() != ISD::FREEZE &&
- !((N->getOpcode() == X86ISD::BSF|| N->getOpcode() == X86ISD::BSR) &&
+ !((N->getOpcode() == X86ISD::BSF || N->getOpcode() == X86ISD::BSR) &&
(!N->getOperand(0).isUndef() && !isa<ConstantSDNode>(N->getOperand(0))));
}]>;
>From cf42716949909450e5c47246425461a3b6921c52 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 23 Jan 2025 11:59:53 +0000
Subject: [PATCH 4/4] Improve isUseDefConvertible description of return value
---
llvm/lib/Target/X86/X86InstrInfo.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index f65316aed65dee..794aa921ca254d 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5220,6 +5220,7 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
}
/// Check whether the use can be converted to remove a comparison against zero.
+/// Returns the EFLAGS condition and the operand that we are comparing against zero.
static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default: