[llvm] [X86] Distribute Certain Bitwise Operations over SELECT (PR #136555)

Marius Kamp via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 21 21:02:07 PDT 2025


https://github.com/mskamp updated https://github.com/llvm/llvm-project/pull/136555

>From 1c2be64493798a299c534f4ffb6fd2576e048676 Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Fri, 18 Apr 2025 11:04:22 +0200
Subject: [PATCH 1/3] [SDAG] Make Select-with-Identity-Fold More Flexible; NFC

This change adds new parameters to the method
`shouldFoldSelectWithIdentityConstant()`. The method now additionally
takes the opcode of the select node and its non-identity operand. To
give the hook access to these arguments, the call to
`shouldFoldSelectWithIdentityConstant()` is moved after all other checks
have been performed. Moreover, this change relaxes the precondition of
the fold so that it applies to `SELECT` nodes in addition to `VSELECT`
nodes.

No functional change is intended because all implementations of
`shouldFoldSelectWithIdentityConstant()` are adjusted such that they
restrict the fold to `VSELECT` nodes, the same restriction as before.

The rationale for this change is to let the backends make more
fine-grained decisions about when to revert the InstCombine
canonicalization of `(select c (binop x y) y)` to
`(binop (select c x idc) y)`.
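
As an illustration, the following minimal IR sketch (the function and
value names are made up) shows the canonicalized form that this hook
decides whether to revert, using `add` with its identity constant 0:

  define i32 @example(i1 %c, i32 %x, i32 %y) {
    ; Before InstCombine: %res = select i1 %c, (add i32 %x, %y), %y
    ; After the canonicalization (0 is the identity constant for add):
    %sel = select i1 %c, i32 %x, i32 0
    %res = add i32 %sel, %y
    ret i32 %res
  }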
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  6 ++--
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 32 +++++++++++--------
 .../Target/AArch64/AArch64ISelLowering.cpp    |  6 ++--
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  6 ++--
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |  8 +++--
 llvm/lib/Target/ARM/ARMISelLowering.h         |  6 ++--
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  8 +++--
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |  6 ++--
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  7 ++--
 llvm/lib/Target/X86/X86ISelLowering.h         |  6 ++--
 10 files changed, 58 insertions(+), 33 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 00c36266a069f..f71fac0df137b 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3353,8 +3353,10 @@ class TargetLoweringBase {
   /// Return true if pulling a binary operation into a select with an identity
   /// constant is profitable. This is the inverse of an IR transform.
   /// Example: X + (Cond ? Y : 0) --> Cond ? (X + Y) : X
-  virtual bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
-                                                    EVT VT) const {
+  virtual bool
+  shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
+                                       unsigned SelectOpcode, SDValue X,
+                                       SDValue NonIdConstNode) const {
     return false;
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b175e35385ec6..7c8619aa29346 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2425,8 +2425,9 @@ static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
   if (ShouldCommuteOperands)
     std::swap(N0, N1);
 
-  // TODO: Should this apply to scalar select too?
-  if (N1.getOpcode() != ISD::VSELECT || !N1.hasOneUse())
+  unsigned SelOpcode = N1.getOpcode();
+  if ((SelOpcode != ISD::VSELECT && SelOpcode != ISD::SELECT) ||
+      !N1.hasOneUse())
     return SDValue();
 
   // We can't hoist all instructions because of immediate UB (not speculatable).
@@ -2439,17 +2440,22 @@ static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
   SDValue Cond = N1.getOperand(0);
   SDValue TVal = N1.getOperand(1);
   SDValue FVal = N1.getOperand(2);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // This transform increases uses of N0, so freeze it to be safe.
   // binop N0, (vselect Cond, IDC, FVal) --> vselect Cond, N0, (binop N0, FVal)
   unsigned OpNo = ShouldCommuteOperands ? 0 : 1;
-  if (isNeutralConstant(Opcode, N->getFlags(), TVal, OpNo)) {
+  if (isNeutralConstant(Opcode, N->getFlags(), TVal, OpNo) &&
+      TLI.shouldFoldSelectWithIdentityConstant(Opcode, VT, SelOpcode, N0,
+                                               FVal)) {
     SDValue F0 = DAG.getFreeze(N0);
     SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, FVal, N->getFlags());
     return DAG.getSelect(SDLoc(N), VT, Cond, F0, NewBO);
   }
   // binop N0, (vselect Cond, TVal, IDC) --> vselect Cond, (binop N0, TVal), N0
-  if (isNeutralConstant(Opcode, N->getFlags(), FVal, OpNo)) {
+  if (isNeutralConstant(Opcode, N->getFlags(), FVal, OpNo) &&
+      TLI.shouldFoldSelectWithIdentityConstant(Opcode, VT, SelOpcode, N0,
+                                               TVal)) {
     SDValue F0 = DAG.getFreeze(N0);
     SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, TVal, N->getFlags());
     return DAG.getSelect(SDLoc(N), VT, Cond, NewBO, F0);
@@ -2459,26 +2465,23 @@ static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
 }
 
 SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 &&
          "Unexpected binary operator");
 
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  auto BinOpcode = BO->getOpcode();
-  EVT VT = BO->getValueType(0);
-  if (TLI.shouldFoldSelectWithIdentityConstant(BinOpcode, VT)) {
-    if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, false))
-      return Sel;
+  if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, false))
+    return Sel;
 
-    if (TLI.isCommutativeBinOp(BO->getOpcode()))
-      if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, true))
-        return Sel;
-  }
+  if (TLI.isCommutativeBinOp(BO->getOpcode()))
+    if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, true))
+      return Sel;
 
   // Don't do this unless the old select is going away. We want to eliminate the
   // binary operator, not replace a binop with a select.
   // TODO: Handle ISD::SELECT_CC.
   unsigned SelOpNo = 0;
   SDValue Sel = BO->getOperand(0);
+  auto BinOpcode = BO->getOpcode();
   if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) {
     SelOpNo = 1;
     Sel = BO->getOperand(1);
@@ -2526,6 +2529,7 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
 
   SDLoc DL(Sel);
   SDValue NewCT, NewCF;
+  EVT VT = BO->getValueType(0);
 
   if (CanFoldNonConst) {
     // If CBO is an opaque constant, we can't rely on getNode to constant fold.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 771eee1b3fecf..9d254afa524df 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18040,8 +18040,10 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
 }
 
 bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
-    unsigned BinOpcode, EVT VT) const {
-  return VT.isScalableVector() && isTypeLegal(VT);
+    unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
+    SDValue NonIdConstNode) const {
+  return VT.isScalableVector() && isTypeLegal(VT) &&
+         SelectOpcode == ISD::VSELECT;
 }
 
 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 0d51ef2be8631..ff3f7fcd2d7c1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -786,8 +786,10 @@ class AArch64TargetLowering : public TargetLowering {
   bool shouldFoldConstantShiftPairToMask(const SDNode *N,
                                          CombineLevel Level) const override;
 
-  bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
-                                            EVT VT) const override;
+  bool
+  shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
+                                       unsigned SelectOpcode, SDValue X,
+                                       SDValue NonIdConstNode) const override;
 
   /// Returns true if it is beneficial to convert a load of a constant
   /// to just the constant itself.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 2290ac2728c6d..9900f6e958498 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13960,9 +13960,11 @@ bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
   return false;
 }
 
-bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
-                                                             EVT VT) const {
-  return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT);
+bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(
+    unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
+    SDValue NonIdConstNode) const {
+  return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
+         SelectOpcode == ISD::VSELECT;
 }
 
 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..dd366eca80461 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -758,8 +758,10 @@ class VectorType;
     bool shouldFoldConstantShiftPairToMask(const SDNode *N,
                                            CombineLevel Level) const override;
 
-    bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
-                                              EVT VT) const override;
+    bool
+    shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
+                                         unsigned SelectOpcode, SDValue X,
+                                         SDValue NonIdConstNode) const override;
 
     bool preferIncOfAddToSubOfNot(EVT VT) const override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 98fba9e86e88a..4d8c8a578f30e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2090,8 +2090,12 @@ bool RISCVTargetLowering::hasBitTest(SDValue X, SDValue Y) const {
   return C && C->getAPIntValue().ule(10);
 }
 
-bool RISCVTargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
-                                                               EVT VT) const {
+bool RISCVTargetLowering::shouldFoldSelectWithIdentityConstant(
+    unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
+    SDValue NonIdConstNode) const {
+  if (SelectOpcode != ISD::VSELECT)
+    return false;
+
   // Only enable for rvv.
   if (!VT.isVector() || !Subtarget.hasVInstructions())
     return false;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index baf1b2e4d8e6e..ed6e3dbbee797 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -585,8 +585,10 @@ class RISCVTargetLowering : public TargetLowering {
                                                 unsigned &NumIntermediates,
                                                 MVT &RegisterVT) const override;
 
-  bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
-                                            EVT VT) const override;
+  bool
+  shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
+                                       unsigned SelectOpcode, SDValue X,
+                                       SDValue NonIdConstNode) const override;
 
   /// Return true if the given shuffle mask can be codegen'd directly, or if it
   /// should be stack expanded.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 993118c52564e..4f35cef3acf0b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35383,8 +35383,11 @@ bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
   return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
 }
 
-bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
-                                                             EVT VT) const {
+bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
+    unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
+    SDValue NonIdConstNode) const {
+  if (SelectOpcode != ISD::VSELECT)
+    return false;
   // TODO: This is too general. There are cases where pre-AVX512 codegen would
   //       benefit. The transform may also be profitable for scalar code.
   if (!Subtarget.hasAVX512())
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 4a2b35e9efe7c..89b817e02ade7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1460,8 +1460,10 @@ namespace llvm {
     /// from i32 to i16.
     bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
 
-    bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
-                                              EVT VT) const override;
+    bool
+    shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
+                                         unsigned SelectOpcode, SDValue X,
+                                         SDValue NonIdConstNode) const override;
 
     /// Given an intrinsic, checks if on the target the intrinsic will need to map
     /// to a MemIntrinsicNode (touches memory). If this is the case, it returns

>From e0207b3714cf7f67c1cbe10a4daf973a57cac3b8 Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Sun, 30 Mar 2025 07:45:47 +0200
Subject: [PATCH 2/3] [X86] Add Tests for Distributing AND/XOR over SELECT; NFC

---
 llvm/test/CodeGen/X86/bmi-select-distrib.ll | 915 ++++++++++++++++++++
 1 file changed, 915 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/bmi-select-distrib.ll

diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
new file mode 100644
index 0000000000000..1147561e22d06
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
@@ -0,0 +1,915 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64
+
+define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_neg_to_blsi1:
+; X64:       # %bb.0:
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = sub i32 0, %a1
+  %bls = select i1 %a0, i32 %sub, i32 -1
+  %ret = and i32 %a1, %bls
+  ret i32 %ret
+}
+
+define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_neg_to_blsi2:
+; X64:       # %bb.0:
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = sub i32 0, %a1
+  %bls = select i1 %a0, i32 %sub, i32 -1
+  %ret = and i32 %bls, %a1
+  ret i32 %ret
+}
+
+define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi3:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    negb %cl
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_neg_to_blsi3:
+; X64:       # %bb.0:
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    negb %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = sub i32 0, %a1
+  %bls = select i1 %a0, i32 -1, i32 %sub
+  %ret = and i32 %a1, %bls
+  ret i32 %ret
+}
+
+define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    andb $1, %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    negl %edi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    cmpb $1, %bl
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_neg_to_blsi_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    negq %rcx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbq %rax, %rax
+; X64-NEXT:    orq %rcx, %rax
+; X64-NEXT:    andq %rsi, %rax
+; X64-NEXT:    retq
+  %sub = sub i64 0, %a1
+  %bls = select i1 %a0, i64 %sub, i64 -1
+  %ret = and i64 %a1, %bls
+  ret i64 %ret
+}
+
+; Negative test
+define i16 @and_select_neg_i16(i1 %a0, i16 %a1) nounwind {
+; X86-LABEL: and_select_neg_i16:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_neg_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %sub = sub i16 0, %a1
+  %bls = select i1 %a0, i16 %sub, i16 -1
+  %ret = and i16 %a1, %bls
+  ret i16 %ret
+}
+
+; Negative test
+define <4 x i32> @and_select_neg_v4xi32(i1 %a0, <4 x i32> %a1) nounwind {
+; X86-LABEL: and_select_neg_v4xi32:
+; X86:       # %bb.0:
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    jne .LBB5_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    pcmpeqd %xmm1, %xmm1
+; X86-NEXT:    pand %xmm1, %xmm0
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB5_1:
+; X86-NEXT:    pxor %xmm1, %xmm1
+; X86-NEXT:    psubd %xmm0, %xmm1
+; X86-NEXT:    pand %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_neg_v4xi32:
+; X64:       # %bb.0:
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    jne .LBB5_1
+; X64-NEXT:  # %bb.2:
+; X64-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-NEXT:    pand %xmm1, %xmm0
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB5_1:
+; X64-NEXT:    pxor %xmm1, %xmm1
+; X64-NEXT:    psubd %xmm0, %xmm1
+; X64-NEXT:    pand %xmm1, %xmm0
+; X64-NEXT:    retq
+  %sub = sub <4 x i32> zeroinitializer, %a1
+  %bls = select i1 %a0, <4 x i32> %sub, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+  %ret = and <4 x i32> %a1, %bls
+  ret <4 x i32> %ret
+}
+
+; Negative test
+define i32 @and_select_no_neg(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_no_neg:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_no_neg:
+; X64:       # %bb.0:
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = sub i32 %a1, 0
+  %bls = select i1 %a0, i32 %sub, i32 -1
+  %ret = and i32 %a1, %bls
+  ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_neg_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_wrong_const:
+; X86:       # %bb.0:
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_neg_wrong_const:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    movl $1, %eax
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = sub i32 0, %a1
+  %bls = select i1 %a0, i32 %sub, i32 1
+  %ret = and i32 %a1, %bls
+  ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_neg_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) nounwind {
+; X86-LABEL: and_select_neg_different_op:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    negl %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_neg_different_op:
+; X64:       # %bb.0:
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    negl %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %edx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = sub i32 0, %a2
+  %bls = select i1 %a0, i32 %sub, i32 -1
+  %ret = and i32 %a1, %bls
+  ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    leal -1(%eax), %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr1:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a1, -1
+  %bls = select i1 %a0, i32 %sub, i32 -1
+  %ret = and i32 %a1, %bls
+  ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    leal -1(%eax), %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr2:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a1, -1
+  %bls = select i1 %a0, i32 %sub, i32 -1
+  %ret = and i32 %bls, %a1
+  ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr3:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    leal -1(%eax), %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    negb %cl
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr3:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    negb %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a1, -1
+  %bls = select i1 %a0, i32 -1, i32 %sub
+  %ret = and i32 %a1, %bls
+  ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr4:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    leal -1(%eax), %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr4:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = sub i32 %a1, 1
+  %bls = select i1 %a0, i32 %sub, i32 -1
+  %ret = and i32 %a1, %bls
+  ret i32 %ret
+}
+
+define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    andb $1, %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    addl $-1, %edi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpb $1, %bl
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    leaq -1(%rsi), %rcx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbq %rax, %rax
+; X64-NEXT:    orq %rcx, %rax
+; X64-NEXT:    andq %rsi, %rax
+; X64-NEXT:    retq
+  %sub = add i64 %a1, -1
+  %bls = select i1 %a0, i64 %sub, i64 -1
+  %ret = and i64 %a1, %bls
+  ret i64 %ret
+}
+
+; Negative test
+define i16 @and_select_sub_1_i16(i1 %a0, i16 %a1) nounwind {
+; X86-LABEL: and_select_sub_1_i16:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    leal -1(%edx), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_sub_1_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %sub = add i16 %a1, -1
+  %bls = select i1 %a0, i16 %sub, i16 -1
+  %ret = and i16 %a1, %bls
+  ret i16 %ret
+}
+
+; Negative test
+define <4 x i32> @and_select_sub_1_v4xi32(i1 %a0, <4 x i32> %a1) nounwind {
+; X86-LABEL: and_select_sub_1_v4xi32:
+; X86:       # %bb.0:
+; X86-NEXT:    pcmpeqd %xmm1, %xmm1
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB15_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    paddd %xmm0, %xmm1
+; X86-NEXT:  .LBB15_2:
+; X86-NEXT:    pand %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_sub_1_v4xi32:
+; X64:       # %bb.0:
+; X64-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    je .LBB15_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    paddd %xmm0, %xmm1
+; X64-NEXT:  .LBB15_2:
+; X64-NEXT:    pand %xmm1, %xmm0
+; X64-NEXT:    retq
+  %sub = add <4 x i32> %a1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bls = select i1 %a0, <4 x i32> %sub, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+  %ret = and <4 x i32> %a1, %bls
+  ret <4 x i32> %ret
+}
+
+; Negative test
+define i32 @and_select_no_sub_1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_no_sub_1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    leal -2(%eax), %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_no_sub_1:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    leal -2(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a1, -2
+  %bls = select i1 %a0, i32 %sub, i32 -1
+  %ret = and i32 %a1, %bls
+  ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_sub_1_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_wrong_const:
+; X86:       # %bb.0:
+; X86-NEXT:    leal -1(%eax), %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_sub_1_wrong_const:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    movl $1, %eax
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a1, -1
+  %bls = select i1 %a0, i32 %sub, i32 1
+  %ret = and i32 %a1, %bls
+  ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) nounwind {
+; X86-LABEL: and_select_sub_1_different_op:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    decl %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: and_select_sub_1_different_op:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    leal -1(%rdx), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a2, -1
+  %bls = select i1 %a0, i32 %sub, i32 -1
+  %ret = and i32 %a1, %bls
+  ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk1:
+; X86:       # %bb.0:
+; X86-NEXT:    leal -1(%eax), %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk1:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a1, -1
+  %bls = select i1 %a0, i32 %sub, i32 0
+  %ret = xor i32 %a1, %bls
+  ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk2:
+; X86:       # %bb.0:
+; X86-NEXT:    leal -1(%eax), %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk2:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a1, -1
+  %bls = select i1 %a0, i32 %sub, i32 0
+  %ret = xor i32 %bls, %a1
+  ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk3:
+; X86:       # %bb.0:
+; X86-NEXT:    leal -1(%eax), %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovel %ecx, %edx
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk3:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovel %ecx, %eax
+; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a1, -1
+  %bls = select i1 %a0, i32 0, i32 %sub
+  %ret = xor i32 %a1, %bls
+  ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk4(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk4:
+; X86:       # %bb.0:
+; X86-NEXT:    leal -1(%eax), %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk4:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    retq
+  %sub = sub i32 %a1, 1
+  %bls = select i1 %a0, i32 %sub, i32 0
+  %ret = xor i32 %a1, %bls
+  ret i32 %ret
+}
+
+define i64 @xor_select_sub_1_to_blsmsk_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovel %edi, %eax
+; X86-NEXT:    cmovel %edi, %edx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    leaq -1(%rsi), %rcx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovneq %rcx, %rax
+; X64-NEXT:    xorq %rsi, %rax
+; X64-NEXT:    retq
+  %sub = add i64 %a1, -1
+  %bls = select i1 %a0, i64 %sub, i64 0
+  %ret = xor i64 %a1, %bls
+  ret i64 %ret
+}
+
+; Negative test
+define i16 @xor_select_sub_1_i16(i1 %a0, i16 %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_i16:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal -1(%ecx), %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: xor_select_sub_1_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %sub = add i16 %a1, -1
+  %bls = select i1 %a0, i16 %sub, i16 0
+  %ret = xor i16 %a1, %bls
+  ret i16 %ret
+}
+
+; Negative test
+define <4 x i32> @xor_select_sub_1_v4xi32(i1 %a0, <4 x i32> %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_v4xi32:
+; X86:       # %bb.0:
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    jne .LBB25_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    xorps %xmm1, %xmm1
+; X86-NEXT:    xorps %xmm1, %xmm0
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB25_1:
+; X86-NEXT:    pcmpeqd %xmm1, %xmm1
+; X86-NEXT:    paddd %xmm0, %xmm1
+; X86-NEXT:    pxor %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: xor_select_sub_1_v4xi32:
+; X64:       # %bb.0:
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    jne .LBB25_1
+; X64-NEXT:  # %bb.2:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    xorps %xmm1, %xmm0
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB25_1:
+; X64-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-NEXT:    paddd %xmm0, %xmm1
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
+  %sub = add <4 x i32> %a1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bls = select i1 %a0, <4 x i32> %sub, <4 x i32> zeroinitializer
+  %ret = xor <4 x i32> %a1, %bls
+  ret <4 x i32> %ret
+}
+
+; Negative test
+define i32 @xor_select_no_sub_1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_no_sub_1:
+; X86:       # %bb.0:
+; X86-NEXT:    leal 1(%eax), %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: xor_select_no_sub_1:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    leal 1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a1, 1
+  %bls = select i1 %a0, i32 %sub, i32 0
+  %ret = xor i32 %a1, %bls
+  ret i32 %ret
+}
+
+; Negative test
+define i32 @xor_select_sub_1_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_wrong_const:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    leal -1(%eax), %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    xorl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: xor_select_sub_1_wrong_const:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    leal -1(%rsi), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpb $1, %dil
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a1, -1
+  %bls = select i1 %a0, i32 %sub, i32 -1
+  %ret = xor i32 %a1, %bls
+  ret i32 %ret
+}
+
+; Negative test
+define i32 @xor_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) nounwind {
+; X86-LABEL: xor_select_sub_1_different_op:
+; X86:       # %bb.0:
+; X86-NEXT:    leal -1(%edx), %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: xor_select_sub_1_different_op:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-NEXT:    leal -1(%rdx), %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    retq
+  %sub = add i32 %a2, -1
+  %bls = select i1 %a0, i32 %sub, i32 0
+  %ret = xor i32 %a1, %bls
+  ret i32 %ret
+}

>From e1569b432ab7bdd4e73b22ab0d6958223d82c4db Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Sun, 30 Mar 2025 08:58:00 +0200
Subject: [PATCH 3/3] [X86] Distribute Certain Bitwise Operations over SELECT

InstCombine canonicalizes `(select P (and X (- X)) X)` to
`(and (select P (- X) umax) X)`. This is counterproductive for the X86
backend when BMI is available because we can encode `(and X (- X))`
using the `BLSI` instruction. A similar situation arises for
`(select P (and X (sub X 1)) X)` (which prevents use of the `BLSR`
instruction) and `(select P (xor X (sub X 1)) X)` (which prevents use of
the `BLSMSK` instruction).

Trigger the inverse transformation in the X86 backend if BMI is
available and we can use the mentioned BMI instructions. This is done by
adjusting the `shouldFoldSelectWithIdentityConstant()` implementation
for the X86 backend. In this way, we get `(select P (and X (- X)) X)`
again, which enables the use of `BLSI` (and similarly for the other
cases described above).

Alive proofs: https://alive2.llvm.org/ce/z/MT_pKi

Fixes #131587, fixes #133848.
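
For illustration, a minimal sketch (function name made up) of the
canonicalized IR that the new tests below exercise. With this patch and
`-mattr=+bmi`, the backend distributes the `and` back over the select,
so `llc` can emit `blsi` plus a conditional move instead of the longer
negate/sbb/or sequence:

  define i32 @blsi_sketch(i1 %p, i32 %x) {
    ; Canonicalized form produced by InstCombine (-1 is the identity for and):
    %neg = sub i32 0, %x
    %sel = select i1 %p, i32 %neg, i32 -1
    %res = and i32 %x, %sel
    ; With BMI, this becomes select %p, (and %x, (0 - %x)), %x -> blsi + cmov.
    ret i32 %res
  }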
---
 llvm/lib/Target/X86/X86ISelLowering.cpp     |  21 +-
 llvm/test/CodeGen/X86/bmi-select-distrib.ll | 299 ++++++--------------
 2 files changed, 101 insertions(+), 219 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4f35cef3acf0b..a2c765ef94b8f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -35386,8 +35387,26 @@ bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
     unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
     SDValue NonIdConstNode) const {
-  if (SelectOpcode != ISD::VSELECT)
+  if (SelectOpcode == ISD::SELECT) {
+    if (VT.isVector())
+      return false;
+    if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
+      return false;
+    using namespace llvm::SDPatternMatch;
+    // BLSI
+    if (BinOpcode == ISD::AND && sd_match(NonIdConstNode, m_Neg(m_Specific(X))))
+      return true;
+    // BLSR
+    if (BinOpcode == ISD::AND &&
+        sd_match(NonIdConstNode, m_Add(m_Specific(X), m_AllOnes())))
+      return true;
+    // BLSMSK
+    if (BinOpcode == ISD::XOR &&
+        sd_match(NonIdConstNode, m_Add(m_Specific(X), m_AllOnes())))
+      return true;
+
     return false;
+  }
   // TODO: This is too general. There are cases where pre-AVX512 codegen would
   //       benefit. The transform may also be profitable for scalar code.
   if (!Subtarget.hasAVX512())
diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
index 1147561e22d06..466f877f57600 100644
--- a/llvm/test/CodeGen/X86/bmi-select-distrib.ll
+++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
@@ -5,29 +5,16 @@
 define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi1:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    negl %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    cmpb $1, %cl
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    blsil %eax, %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_neg_to_blsi1:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpb $1, %dil
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    blsil %esi, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %sub = sub i32 0, %a1
   %bls = select i1 %a0, i32 %sub, i32 -1
@@ -38,29 +25,16 @@ define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
 define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    negl %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    cmpb $1, %cl
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    blsil %eax, %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_neg_to_blsi2:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpb $1, %dil
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    blsil %esi, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %sub = sub i32 0, %a1
   %bls = select i1 %a0, i32 %sub, i32 -1
@@ -71,29 +45,16 @@ define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
 define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi3:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    negl %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    negb %cl
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    blsil %eax, %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_neg_to_blsi3:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    negb %dil
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    blsil %esi, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %esi, %eax
 ; X64-NEXT:    retq
   %sub = sub i32 0, %a1
   %bls = select i1 %a0, i32 -1, i32 %sub
@@ -104,39 +65,26 @@ define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
 define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    andb $1, %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    negl %edi
-; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    negl %eax
 ; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    cmpb $1, %bl
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    andl %esi, %edx
 ; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovel %esi, %edx
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_neg_to_blsi_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    negq %rcx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpb $1, %dil
-; X64-NEXT:    sbbq %rax, %rax
-; X64-NEXT:    orq %rcx, %rax
-; X64-NEXT:    andq %rsi, %rax
+; X64-NEXT:    blsiq %rsi, %rax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmoveq %rsi, %rax
 ; X64-NEXT:    retq
   %sub = sub i64 0, %a1
   %bls = select i1 %a0, i64 %sub, i64 -1
@@ -306,28 +254,16 @@ define i32 @and_select_neg_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) no
 define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr1:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    leal -1(%eax), %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    cmpb $1, %cl
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    blsrl %eax, %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_sub_1_to_blsr1:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    leal -1(%rsi), %ecx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpb $1, %dil
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    blsrl %esi, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %sub = add i32 %a1, -1
   %bls = select i1 %a0, i32 %sub, i32 -1
@@ -338,28 +274,16 @@ define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
 define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    leal -1(%eax), %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    cmpb $1, %cl
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    blsrl %eax, %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_sub_1_to_blsr2:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    leal -1(%rsi), %ecx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpb $1, %dil
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    blsrl %esi, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %sub = add i32 %a1, -1
   %bls = select i1 %a0, i32 %sub, i32 -1
@@ -370,28 +294,16 @@ define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
 define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr3:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    leal -1(%eax), %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    negb %cl
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    blsrl %eax, %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_sub_1_to_blsr3:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    leal -1(%rsi), %ecx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    negb %dil
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    blsrl %esi, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %esi, %eax
 ; X64-NEXT:    retq
   %sub = add i32 %a1, -1
   %bls = select i1 %a0, i32 -1, i32 %sub
@@ -402,28 +314,16 @@ define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
 define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr4:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    leal -1(%eax), %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    cmpb $1, %cl
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    blsrl %eax, %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_sub_1_to_blsr4:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    leal -1(%rsi), %ecx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpb $1, %dil
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    blsrl %esi, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %sub = sub i32 %a1, 1
   %bls = select i1 %a0, i32 %sub, i32 -1
@@ -434,38 +334,26 @@ define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
 define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    andb $1, %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    addl $-1, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpb $1, %bl
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    andl %esi, %edx
 ; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovel %ecx, %eax
+; X86-NEXT:    cmovel %esi, %edx
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_sub_1_to_blsr_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    leaq -1(%rsi), %rcx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpb $1, %dil
-; X64-NEXT:    sbbq %rax, %rax
-; X64-NEXT:    orq %rcx, %rax
-; X64-NEXT:    andq %rsi, %rax
+; X64-NEXT:    blsrq %rsi, %rax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmoveq %rsi, %rax
 ; X64-NEXT:    retq
   %sub = add i64 %a1, -1
   %bls = select i1 %a0, i64 %sub, i64 -1
@@ -633,21 +521,16 @@ define i32 @and_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2)
 define i32 @xor_select_sub_1_to_blsmsk1(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: xor_select_sub_1_to_blsmsk1:
 ; X86:       # %bb.0:
-; X86-NEXT:    leal -1(%eax), %ecx
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    blsmskl %eax, %ecx
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: xor_select_sub_1_to_blsmsk1:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    leal -1(%rsi), %ecx
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    blsmskl %esi, %eax
 ; X64-NEXT:    testb $1, %dil
-; X64-NEXT:    cmovnel %ecx, %eax
-; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %sub = add i32 %a1, -1
   %bls = select i1 %a0, i32 %sub, i32 0
@@ -658,21 +541,16 @@ define i32 @xor_select_sub_1_to_blsmsk1(i1 %a0, i32 inreg %a1) nounwind {
 define i32 @xor_select_sub_1_to_blsmsk2(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: xor_select_sub_1_to_blsmsk2:
 ; X86:       # %bb.0:
-; X86-NEXT:    leal -1(%eax), %ecx
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    blsmskl %eax, %ecx
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: xor_select_sub_1_to_blsmsk2:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    leal -1(%rsi), %ecx
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    blsmskl %esi, %eax
 ; X64-NEXT:    testb $1, %dil
-; X64-NEXT:    cmovnel %ecx, %eax
-; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %sub = add i32 %a1, -1
   %bls = select i1 %a0, i32 %sub, i32 0
@@ -683,21 +561,16 @@ define i32 @xor_select_sub_1_to_blsmsk2(i1 %a0, i32 inreg %a1) nounwind {
 define i32 @xor_select_sub_1_to_blsmsk3(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: xor_select_sub_1_to_blsmsk3:
 ; X86:       # %bb.0:
-; X86-NEXT:    leal -1(%eax), %ecx
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    blsmskl %eax, %ecx
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovel %ecx, %edx
-; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: xor_select_sub_1_to_blsmsk3:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    leal -1(%rsi), %ecx
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    blsmskl %esi, %eax
 ; X64-NEXT:    testb $1, %dil
-; X64-NEXT:    cmovel %ecx, %eax
-; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    cmovnel %esi, %eax
 ; X64-NEXT:    retq
   %sub = add i32 %a1, -1
   %bls = select i1 %a0, i32 0, i32 %sub
@@ -708,21 +581,16 @@ define i32 @xor_select_sub_1_to_blsmsk3(i1 %a0, i32 inreg %a1) nounwind {
 define i32 @xor_select_sub_1_to_blsmsk4(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: xor_select_sub_1_to_blsmsk4:
 ; X86:       # %bb.0:
-; X86-NEXT:    leal -1(%eax), %ecx
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    blsmskl %eax, %ecx
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: xor_select_sub_1_to_blsmsk4:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    leal -1(%rsi), %ecx
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    blsmskl %esi, %eax
 ; X64-NEXT:    testb $1, %dil
-; X64-NEXT:    cmovnel %ecx, %eax
-; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %sub = sub i32 %a1, 1
   %bls = select i1 %a0, i32 %sub, i32 0
@@ -733,7 +601,6 @@ define i32 @xor_select_sub_1_to_blsmsk4(i1 %a0, i32 inreg %a1) nounwind {
 define i64 @xor_select_sub_1_to_blsmsk_i64(i1 %a0, i64 %a1) nounwind {
 ; X86-LABEL: xor_select_sub_1_to_blsmsk_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -741,23 +608,19 @@ define i64 @xor_select_sub_1_to_blsmsk_i64(i1 %a0, i64 %a1) nounwind {
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovel %edi, %eax
-; X86-NEXT:    cmovel %edi, %edx
-; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovel %ecx, %eax
+; X86-NEXT:    cmovel %esi, %edx
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: xor_select_sub_1_to_blsmsk_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    leaq -1(%rsi), %rcx
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    blsmskq %rsi, %rax
 ; X64-NEXT:    testb $1, %dil
-; X64-NEXT:    cmovneq %rcx, %rax
-; X64-NEXT:    xorq %rsi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
 ; X64-NEXT:    retq
   %sub = add i64 %a1, -1
   %bls = select i1 %a0, i64 %sub, i64 0


