[llvm] f9de132 - [X86] Promote i8/i16 CTTZ (BSF) instructions and remove speculation branch

Wed Aug 24 09:28:44 PDT 2022

Author: Simon Pilgrim
Date: 2022-08-24T17:28:18+01:00
New Revision: f9de13232f9b1e8f35625a3a4af4c529ebc7663d

URL: https://github.com/llvm/llvm-project/commit/f9de13232f9b1e8f35625a3a4af4c529ebc7663d
DIFF: https://github.com/llvm/llvm-project/commit/f9de13232f9b1e8f35625a3a4af4c529ebc7663d.diff

LOG: [X86] Promote i8/i16 CTTZ (BSF) instructions and remove speculation branch

This patch adds a Type operand to the TLI isCheapToSpeculateCttz/isCheapToSpeculateCtlz callbacks, allowing targets to decide whether branches should occur on a type-by-type/legality basis.

For X86, this patch proposes to allow CTTZ speculation for i8/i16 types that will lower to promoted i32 BSF instructions by setting the bit just above the msb of the operand, so the promoted source is never zero (we already do something similar for i8/i16 TZCNT). This required a minor tweak to CTTZ lowering - if the src operand is known never zero (e.g. due to the promotion masking) we can remove the CMOV zero src handling.

Although BSF isn't very fast, most CPUs from the last 20 years don't do that bad a job with it, despite some annoying passthrough EFLAGS dependencies. Additionally, now that we emit 'REP BSF' in most cases, we are tending towards assuming this will most likely be executed as a TZCNT instruction on any semi-modern CPU.

Differential Revision: https://reviews.llvm.org/D132520

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/CodeGenPrepare.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/lib/Target/ARM/ARMISelLowering.h
    llvm/lib/Target/Hexagon/HexagonISelLowering.h
    llvm/lib/Target/Mips/MipsISelLowering.cpp
    llvm/lib/Target/Mips/MipsISelLowering.h
    llvm/lib/Target/NVPTX/NVPTXISelLowering.h
    llvm/lib/Target/PowerPC/PPCISelLowering.h
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/lib/Target/SystemZ/SystemZISelLowering.h
    llvm/lib/Target/VE/VEISelLowering.h
    llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
    llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h
    llvm/test/Analysis/CostModel/X86/cttz.ll
    llvm/test/CodeGen/X86/clz.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 16996300e7bb9..6bdc620fc18f6 100644

--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1474,13 +1474,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       break;
     case Intrinsic::cttz:
       // FIXME: If necessary, this should go in target-specific overrides.
-      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
+      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
         return TargetTransformInfo::TCC_Basic;
       break;
 
     case Intrinsic::ctlz:
       // FIXME: If necessary, this should go in target-specific overrides.
-      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
+      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
         return TargetTransformInfo::TCC_Basic;
       break;
 

diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 94fafcc11aaf2..bf4e85881e778 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -621,12 +621,12 @@ class TargetLoweringBase {
   }
 
   /// Return true if it is cheap to speculate a call to intrinsic cttz.
-  virtual bool isCheapToSpeculateCttz() const {
+  virtual bool isCheapToSpeculateCttz(Type *Ty) const {
     return false;
   }
 
   /// Return true if it is cheap to speculate a call to intrinsic ctlz.
-  virtual bool isCheapToSpeculateCtlz() const {
+  virtual bool isCheapToSpeculateCtlz(Type *Ty) const {
     return false;
   }
 

diff  --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 88c01c32f4a71..4bd3ba3cc98b9 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2045,13 +2045,13 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
     return false;
 
   // If it's cheap to speculate, there's nothing to do.
+  Type *Ty = CountZeros->getType();
   auto IntrinsicID = CountZeros->getIntrinsicID();
-  if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
-      (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
+  if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz(Ty)) ||
+      (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))
     return false;
 
   // Only handle legal scalar cases. Anything else requires too much work.
-  Type *Ty = CountZeros->getType();
   unsigned SizeInBits = Ty->getScalarSizeInBits();
   if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
     return false;

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 004f4a520736d..673c151619faa 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -742,11 +742,11 @@ class AArch64TargetLowering : public TargetLowering {
     return true;
   }
 
-  bool isCheapToSpeculateCttz() const override {
+  bool isCheapToSpeculateCttz(Type *) const override {
     return true;
   }
 
-  bool isCheapToSpeculateCtlz() const override {
+  bool isCheapToSpeculateCtlz(Type *) const override {
     return true;
   }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3f16367add784..4b0dd9f1e6485 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -692,11 +692,11 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
 // profitable with the expansion for 64-bit since it's generally good to
 // speculate things.
-bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   return true;
 }
 
-bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
+bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   return true;
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 73081483f1c3d..11ee9f9ff0dd5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -193,8 +193,8 @@ class AMDGPUTargetLowering : public TargetLowering {
                                     unsigned NumElem,
                                     unsigned AS) const override;
   bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
-  bool isCheapToSpeculateCttz() const override;
-  bool isCheapToSpeculateCtlz() const override;
+  bool isCheapToSpeculateCttz(Type *Ty) const override;
+  bool isCheapToSpeculateCtlz(Type *Ty) const override;
 
   bool isSDNodeAlwaysUniform(const SDNode *N) const override;
   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);

diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 2306193f04297..9da2cf2a9f94b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21157,11 +21157,11 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
   return false;
 }
 
-bool ARMTargetLowering::isCheapToSpeculateCttz() const {
+bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   return Subtarget->hasV6T2Ops();
 }
 
-bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
+bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   return Subtarget->hasV6T2Ops();
 }
 

diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 8947c4add327f..9ff920f230e22 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -679,8 +679,8 @@ class VectorType;
       return (MemVT.getSizeInBits() <= 32);
     }
 
-    bool isCheapToSpeculateCttz() const override;
-    bool isCheapToSpeculateCtlz() const override;
+    bool isCheapToSpeculateCttz(Type *Ty) const override;
+    bool isCheapToSpeculateCtlz(Type *Ty) const override;
 
     bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
       return VT.isScalarInteger();

diff  --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 1dc6a4cb9c894..8ef7606cda195 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -129,8 +129,8 @@ class HexagonTargetLowering : public TargetLowering {
   bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
   bool isTruncateFree(EVT VT1, EVT VT2) const override;
 
-  bool isCheapToSpeculateCttz() const override { return true; }
-  bool isCheapToSpeculateCtlz() const override { return true; }
+  bool isCheapToSpeculateCttz(Type *) const override { return true; }
+  bool isCheapToSpeculateCtlz(Type *) const override { return true; }
   bool isCtlzFast() const override { return true; }
 
   bool hasBitTest(SDValue X, SDValue Y) const override;

diff  --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 05f411c9e4cd3..3aee67653c32e 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -1172,11 +1172,11 @@ SDValue  MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
   return SDValue();
 }
 
-bool MipsTargetLowering::isCheapToSpeculateCttz() const {
+bool MipsTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   return Subtarget.hasMips32();
 }
 
-bool MipsTargetLowering::isCheapToSpeculateCtlz() const {
+bool MipsTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   return Subtarget.hasMips32();
 }
 

diff  --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h
index 1f921fbe94916..723be3b31dce2 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -283,8 +283,8 @@ class TargetRegisterClass;
     EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
                             ISD::NodeType) const override;
 
-    bool isCheapToSpeculateCttz() const override;
-    bool isCheapToSpeculateCtlz() const override;
+    bool isCheapToSpeculateCttz(Type *Ty) const override;
+    bool isCheapToSpeculateCtlz(Type *Ty) const override;
     bool hasBitTest(SDValue X, SDValue Y) const override;
     bool shouldFoldConstantShiftPairToMask(const SDNode *N,
                                            CombineLevel Level) const override;

diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index fb09f99a019d0..ae66816548f9f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -559,7 +559,7 @@ class NVPTXTargetLowering : public TargetLowering {
   // x == 0 is not undefined behavior) into a branch that checks whether x is 0
   // and avoids calling ctlz in that case.  We have a dedicated ctlz
   // instruction, so we say that ctlz is cheap to speculate.
-  bool isCheapToSpeculateCtlz() const override { return true; }
+  bool isCheapToSpeculateCtlz(Type *Ty) const override { return true; }
 
   AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override {
     return AtomicExpansionKind::None;

diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 4a08cc42fa9d7..efceab2180e7e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -790,11 +790,11 @@ namespace llvm {
       return MVT::i32;
     }
 
-    bool isCheapToSpeculateCttz() const override {
+    bool isCheapToSpeculateCttz(Type *Ty) const override {
       return true;
     }
 
-    bool isCheapToSpeculateCtlz() const override {
+    bool isCheapToSpeculateCtlz(Type *Ty) const override {
       return true;
     }
 

diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 48b9d283b9638..0a405f1b5e0de 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1155,11 +1155,11 @@ bool RISCVTargetLowering::signExtendConstant(const ConstantInt *CI) const {
   return Subtarget.is64Bit() && CI->getType()->isIntegerTy(32);
 }
 
-bool RISCVTargetLowering::isCheapToSpeculateCttz() const {
+bool RISCVTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   return Subtarget.hasStdExtZbb();
 }
 
-bool RISCVTargetLowering::isCheapToSpeculateCtlz() const {
+bool RISCVTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   return Subtarget.hasStdExtZbb();
 }
 

diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 5a1acde4a5b26..dcaa7f24b4c8d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -370,8 +370,8 @@ class RISCVTargetLowering : public TargetLowering {
   bool isZExtFree(SDValue Val, EVT VT2) const override;
   bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;
   bool signExtendConstant(const ConstantInt *CI) const override;
-  bool isCheapToSpeculateCttz() const override;
-  bool isCheapToSpeculateCtlz() const override;
+  bool isCheapToSpeculateCttz(Type *Ty) const override;
+  bool isCheapToSpeculateCtlz(Type *Ty) const override;
   bool hasAndNotCompare(SDValue Y) const override;
   bool hasBitTest(SDValue X, SDValue Y) const override;
   bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(

diff  --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index b9c95274f62b8..66f0fa20f3e32 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -423,7 +423,7 @@ class SystemZTargetLowering : public TargetLowering {
       return 1;
     return TargetLowering::getNumRegisters(Context, VT);
   }
-  bool isCheapToSpeculateCtlz() const override { return true; }
+  bool isCheapToSpeculateCtlz(Type *) const override { return true; }
   bool preferZeroCompareBranch() const override { return true; }
   bool hasBitPreservingFPLogic(EVT VT) const override {
     EVT ScVT = VT.getScalarType();

diff  --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
index 087b0e215407c..b9a29e4362d64 100644
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -236,7 +236,7 @@ class VETargetLowering : public TargetLowering {
   // VE doesn't have rem.
   bool hasStandaloneRem(EVT) const override { return false; }
   // VE LDZ instruction returns 64 if the input is zero.
-  bool isCheapToSpeculateCtlz() const override { return true; }
+  bool isCheapToSpeculateCtlz(Type *) const override { return true; }
   // VE LDZ instruction is fast.
   bool isCtlzFast() const override { return true; }
   // VE has NND instruction.

diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 84823218216b6..2c60b0c223a4f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -751,12 +751,12 @@ WebAssemblyTargetLowering::getRegForInlineAsmConstraint(
   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 
-bool WebAssemblyTargetLowering::isCheapToSpeculateCttz() const {
+bool WebAssemblyTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   // Assume ctz is a relatively cheap operation.
   return true;
 }
 
-bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz() const {
+bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   // Assume clz is a relatively cheap operation.
   return true;
 }

diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index d86f2e59e3d2c..15b251c613d7b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -65,8 +65,8 @@ class WebAssemblyTargetLowering final : public TargetLowering {
   std::pair<unsigned, const TargetRegisterClass *>
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                StringRef Constraint, MVT VT) const override;
-  bool isCheapToSpeculateCttz() const override;
-  bool isCheapToSpeculateCtlz() const override;
+  bool isCheapToSpeculateCttz(Type *Ty) const override;
+  bool isCheapToSpeculateCtlz(Type *Ty) const override;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS,
                              Instruction *I = nullptr) const override;

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 431c2c8ca48fd..69f25d19f49bc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5820,12 +5820,13 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
   return VT.isSimple() || !isOperationExpand(Opcode, VT);
 }
 
-bool X86TargetLowering::isCheapToSpeculateCttz() const {
-  // Speculate cttz only if we can directly use TZCNT.
-  return Subtarget.hasBMI();
+bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
+  // Speculate cttz only if we can directly use TZCNT or can promote to i32.
+  return Subtarget.hasBMI() ||
+         (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
 }
 
-bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   // Speculate ctlz only if we can directly use LZCNT.
   return Subtarget.hasLZCNT();
 }
@@ -28877,6 +28878,10 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
 
+  // If src is known never zero we can skip the CMOV.
+  if (DAG.isKnownNeverZero(N0))
+    return Op;
+
   // If src is zero (i.e. bsf sets ZF), returns NumBits.
   SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
                    DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b5cfcda519de4..184f53a721035 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1033,9 +1033,9 @@ namespace llvm {
     bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                           const MachineFunction &MF) const override;
 
-    bool isCheapToSpeculateCttz() const override;
+    bool isCheapToSpeculateCttz(Type *Ty) const override;
 
-    bool isCheapToSpeculateCtlz() const override;
+    bool isCheapToSpeculateCtlz(Type *Ty) const override;
 
     bool isCtlzFast() const override;
 

diff  --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll
index 0d6bff16aa536..e456ecd4caee4 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=-bmi,+sse2 | FileCheck %s -check-prefixes=SSE2,NOBMI
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse2 | FileCheck %s -check-prefixes=SSE2,BMI
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse4.2 | FileCheck %s -check-prefixes=BMI,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx | FileCheck %s -check-prefixes=BMI,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx2 | FileCheck %s -check-prefixes=BMI,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512f | FileCheck %s -check-prefixes=BMI,AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=BMI,AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=BMI,AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512vpopcntdq | FileCheck %s -check-prefixes=BMI,AVX512,AVX512VPOPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bitalg | FileCheck %s -check-prefixes=BMI,AVX512,AVX512BITALG
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=-bmi,+sse2 | FileCheck %s -check-prefixes=CHECK,SSE2,NOBMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse2 | FileCheck %s -check-prefixes=CHECK,SSE2,BMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse4.2 | FileCheck %s -check-prefixes=CHECK,BMI,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx | FileCheck %s -check-prefixes=CHECK,BMI,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx2 | FileCheck %s -check-prefixes=CHECK,BMI,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512f | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512vpopcntdq | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512VPOPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bitalg | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512BITALG
 
 ; Verify the cost of scalar trailing zero count instructions.
 
@@ -70,52 +70,36 @@ define i32 @var_cttz_i32u(i32 %a) {
 }
 
 define i16 @var_cttz_i16(i16 %a) {
-; NOBMI-LABEL: 'var_cttz_i16'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
-;
-; BMI-LABEL: 'var_cttz_i16'
-; BMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
-; BMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
+; CHECK-LABEL: 'var_cttz_i16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
 ;
   %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 0)
   ret i16 %cttz
 }
 
 define i16 @var_cttz_i16u(i16 %a) {
-; NOBMI-LABEL: 'var_cttz_i16u'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true)
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
-;
-; BMI-LABEL: 'var_cttz_i16u'
-; BMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true)
-; BMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
+; CHECK-LABEL: 'var_cttz_i16u'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
 ;
   %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 1)
   ret i16 %cttz
 }
 
 define i8 @var_cttz_i8(i8 %a) {
-; NOBMI-LABEL: 'var_cttz_i8'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
-;
-; BMI-LABEL: 'var_cttz_i8'
-; BMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
-; BMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
+; CHECK-LABEL: 'var_cttz_i8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
 ;
   %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 0)
   ret i8 %cttz
 }
 
 define i8 @var_cttz_i8u(i8 %a) {
-; NOBMI-LABEL: 'var_cttz_i8u'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true)
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
-;
-; BMI-LABEL: 'var_cttz_i8u'
-; BMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true)
-; BMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
+; CHECK-LABEL: 'var_cttz_i8u'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
 ;
   %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 1)
   ret i8 %cttz

diff  --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll
index aeb14820f00ed..ce9e3118f1ebd 100644
--- a/llvm/test/CodeGen/X86/clz.ll
+++ b/llvm/test/CodeGen/X86/clz.ll
@@ -510,34 +510,20 @@ define i64 @ctlz_i64_zero_test(i64 %n) {
   ret i64 %tmp1
 }
 
-; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+; Promote i8 cttz to i32 and mask bit8 to prevent (slow) zero-src bsf case.
 define i8 @cttz_i8_zero_test(i8 %n) {
 ; X86-LABEL: cttz_i8_zero_test:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testb %al, %al
-; X86-NEXT:    je .LBB12_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl $256, %eax # imm = 0x100
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB12_1:
-; X86-NEXT:    movb $8, %al
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
 ;
 ; X64-LABEL: cttz_i8_zero_test:
 ; X64:       # %bb.0:
-; X64-NEXT:    testb %dil, %dil
-; X64-NEXT:    je .LBB12_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB12_1:
-; X64-NEXT:    movb $8, %al
+; X64-NEXT:    orl $256, %edi # imm = 0x100
+; X64-NEXT:    rep bsfl %edi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -559,34 +545,22 @@ define i8 @cttz_i8_zero_test(i8 %n) {
   ret i8 %tmp1
 }
 
-; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+; Promote i16 cttz to i32 and mask bit16 to prevent (slow) zero-src bsf case.
 define i16 @cttz_i16_zero_test(i16 %n) {
 ; X86-LABEL: cttz_i16_zero_test:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testw %ax, %ax
-; X86-NEXT:    je .LBB13_1
-; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    movl $65536, %eax # imm = 0x10000
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB13_1:
-; X86-NEXT:    movw $16, %ax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    retl
 ;
 ; X64-LABEL: cttz_i16_zero_test:
 ; X64:       # %bb.0:
-; X64-NEXT:    testw %di, %di
-; X64-NEXT:    je .LBB13_1
-; X64-NEXT:  # %bb.2: # %cond.false
+; X64-NEXT:    orl $65536, %edi # imm = 0x10000
 ; X64-NEXT:    rep bsfl %edi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB13_1:
-; X64-NEXT:    movw $16, %ax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: cttz_i16_zero_test:
 ; X86-CLZ:       # %bb.0: