[llvm] [X86] Distribute Certain Bitwise Operations over SELECT (PR #136555)
Marius Kamp via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 21 02:45:32 PDT 2025
https://github.com/mskamp created https://github.com/llvm/llvm-project/pull/136555
InstCombine canonicalizes `(select P (and X (- X)) X)` to
`(and (select P (- X) umax) X)`, where `umax` is the all-ones identity
constant of `and`. This is counterproductive for the X86 backend when
BMI is available because we can encode `(and X (- X))` using the `BLSI`
instruction. A similar situation arises for
`(select P (and X (sub X 1)) X)` (which prevents use of the `BLSR`
instruction) and `(select P (xor X (sub X 1)) X)` (which prevents use of
the `BLSMSK` instruction).
Trigger the inverse transformation in the X86 backend if BMI is
available and the mentioned BMI instructions can be used. This is done
by overriding the appropriate `shouldFoldSelectWithIdentityConstant()`
overload. In this way, we get `(select P (and X (- X)) X)` again, which
enables the use of `BLSI` (and similarly for the other cases described
above).
Alive proofs: https://alive2.llvm.org/ce/z/MT_pKi
Fixes #131587, fixes #133848.
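To make the motivation concrete, here is a hypothetical C++ helper (name
and signature made up for illustration, not part of the patch) whose
taken branch is exactly the `BLSI` pattern; depending on the
optimization pipeline, InstCombine turns the resulting select into the
canonicalized form quoted above:

```cpp
// Illustrative only. With BMI, `x & -x` is a single BLSI instruction. After
// InstCombine's canonicalization the backend instead sees an AND of a select,
// which this patch folds back into a select around the BLSI-able AND.
unsigned lowest_set_bit_or_self(bool p, unsigned x) {
  return p ? (x & -x) : x; // select p, (x & -x), x  ~>  (select p, -x, -1) & x
}
```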
From 806fdc796828ba9940072c04dbd3621f0e56d0b5 Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Fri, 18 Apr 2025 11:04:22 +0200
Subject: [PATCH 1/3] [SDAG] Make Select-with-Identity-Fold More Flexible; NFC
This change introduces a new overload of the method
`shouldFoldSelectWithIdentityConstant()`, which takes the opcode of the
binary operation, the opcode of the select node, the other operand of
the binary operation, and the select operand that is not the identity
constant. It is called only after all other checks have passed.
Moreover, this change adjusts the precondition of the fold so that it
also applies to `SELECT` nodes in addition to `VSELECT` nodes.
No functional change is intended because the default (and currently
only) implementation restricts the fold to `VSELECT` nodes, which is the
same restriction as before.
The rationale for this change is to allow backends to make more
fine-grained decisions about when to revert the InstCombine
canonicalization of `(select c (binop x y) y)` to
`(binop (select c x idc) y)`.
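For illustration, a backend could use the new hook roughly as follows.
This is a minimal sketch under stated assumptions: `MyTargetLowering` is
a hypothetical target class, the usual SelectionDAG headers are assumed,
and the authoritative X86 override is the one added in the third patch
below.

```cpp
// Hypothetical override sketch. This hook is consulted only after the existing
// (BinOpcode, VT) overload has already returned true for the operation/type.
bool MyTargetLowering::shouldFoldSelectWithIdentityConstant(
    unsigned BinOpcode, unsigned SelectOpcode, SDValue X,
    SDValue NonIdConstNode) const {
  using namespace llvm::SDPatternMatch;
  // Keep the previous behavior for vector selects.
  if (SelectOpcode == ISD::VSELECT)
    return true;
  // For scalar selects, undo the canonicalization only when the non-identity
  // select operand is -X, i.e. the result becomes (select P, (and X, 0-X), X).
  return BinOpcode == ISD::AND &&
         sd_match(NonIdConstNode, m_Neg(m_Specific(X)));
}
```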
---
llvm/include/llvm/CodeGen/TargetLowering.h | 15 ++++++++++++++-
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 12 ++++++++----
2 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 00c36266a069f..ace71bee0ac34 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3351,13 +3351,26 @@ class TargetLoweringBase {
}
/// Return true if pulling a binary operation into a select with an identity
- /// constant is profitable. This is the inverse of an IR transform.
+ /// constant is profitable for the given binary operation and type. This is
+ /// the inverse of an IR transform.
/// Example: X + (Cond ? Y : 0) --> Cond ? (X + Y) : X
virtual bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
EVT VT) const {
return false;
}
+ /// Return true if pulling a binary operation into a select with an identity
+ /// constant is profitable for the given binary operation, select operation,
+ /// operand to the binary operation, and select operand that is not the
+ /// identity constant. This is a more fine-grained variant of the previous
+ /// overload that is called only if the previous overload returned true.
+ virtual bool
+ shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
+ unsigned SelectOpcode, SDValue X,
+ SDValue NonIdConstNode) const {
+ return SelectOpcode == ISD::VSELECT;
+ }
+
/// Return true if it is beneficial to convert a load of a constant to
/// just the constant itself.
/// On some targets it might be more efficient to use a combination of
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b175e35385ec6..c67614f4aa759 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2425,8 +2425,9 @@ static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
if (ShouldCommuteOperands)
std::swap(N0, N1);
- // TODO: Should this apply to scalar select too?
- if (N1.getOpcode() != ISD::VSELECT || !N1.hasOneUse())
+ unsigned SelOpcode = N1.getOpcode();
+ if ((SelOpcode != ISD::VSELECT && SelOpcode != ISD::SELECT) ||
+ !N1.hasOneUse())
return SDValue();
// We can't hoist all instructions because of immediate UB (not speculatable).
@@ -2439,17 +2440,20 @@ static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
SDValue Cond = N1.getOperand(0);
SDValue TVal = N1.getOperand(1);
SDValue FVal = N1.getOperand(2);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// This transform increases uses of N0, so freeze it to be safe.
// binop N0, (vselect Cond, IDC, FVal) --> vselect Cond, N0, (binop N0, FVal)
unsigned OpNo = ShouldCommuteOperands ? 0 : 1;
- if (isNeutralConstant(Opcode, N->getFlags(), TVal, OpNo)) {
+ if (isNeutralConstant(Opcode, N->getFlags(), TVal, OpNo) &&
+ TLI.shouldFoldSelectWithIdentityConstant(Opcode, SelOpcode, N0, FVal)) {
SDValue F0 = DAG.getFreeze(N0);
SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, FVal, N->getFlags());
return DAG.getSelect(SDLoc(N), VT, Cond, F0, NewBO);
}
// binop N0, (vselect Cond, TVal, IDC) --> vselect Cond, (binop N0, TVal), N0
- if (isNeutralConstant(Opcode, N->getFlags(), FVal, OpNo)) {
+ if (isNeutralConstant(Opcode, N->getFlags(), FVal, OpNo) &&
+ TLI.shouldFoldSelectWithIdentityConstant(Opcode, SelOpcode, N0, TVal)) {
SDValue F0 = DAG.getFreeze(N0);
SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, TVal, N->getFlags());
return DAG.getSelect(SDLoc(N), VT, Cond, NewBO, F0);
From 733d9d47e3da2584748bb92c47047b6e602a49c6 Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Sun, 30 Mar 2025 07:45:47 +0200
Subject: [PATCH 2/3] [X86] Add Tests for Distributing AND/XOR over SELECT; NFC
---
llvm/test/CodeGen/X86/bmi-select-distrib.ll | 915 ++++++++++++++++++++
1 file changed, 915 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/bmi-select-distrib.ll
diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
new file mode 100644
index 0000000000000..1147561e22d06
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
@@ -0,0 +1,915 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64
+
+define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi1:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: negl %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi1:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: negl %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: negl %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi2:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: negl %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %bls, %a1
+ ret i32 %ret
+}
+
+define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi3:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: negl %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: negb %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi3:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: negl %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: negb %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 -1, i32 %sub
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: andb $1, %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: negl %edi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: cmpb $1, %bl
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi_i64:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movq %rsi, %rcx
+; X64-NEXT: negq %rcx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbq %rax, %rax
+; X64-NEXT: orq %rcx, %rax
+; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: retq
+ %sub = sub i64 0, %a1
+ %bls = select i1 %a0, i64 %sub, i64 -1
+ %ret = and i64 %a1, %bls
+ ret i64 %ret
+}
+
+; Negative test
+define i16 @and_select_neg_i16(i1 %a0, i16 %a1) nounwind {
+; X86-LABEL: and_select_neg_i16:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: negl %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_i16:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: negl %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %sub = sub i16 0, %a1
+ %bls = select i1 %a0, i16 %sub, i16 -1
+ %ret = and i16 %a1, %bls
+ ret i16 %ret
+}
+
+; Negative test
+define <4 x i32> @and_select_neg_v4xi32(i1 %a0, <4 x i32> %a1) nounwind {
+; X86-LABEL: and_select_neg_v4xi32:
+; X86: # %bb.0:
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: jne .LBB5_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: retl
+; X86-NEXT: .LBB5_1:
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: psubd %xmm0, %xmm1
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_v4xi32:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: jne .LBB5_1
+; X64-NEXT: # %bb.2:
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+; X64-NEXT: .LBB5_1:
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: psubd %xmm0, %xmm1
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+ %sub = sub <4 x i32> zeroinitializer, %a1
+ %bls = select i1 %a0, <4 x i32> %sub, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+ %ret = and <4 x i32> %a1, %bls
+ ret <4 x i32> %ret
+}
+
+; Negative test
+define i32 @and_select_no_neg(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_no_neg:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_no_neg:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %esi, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 %a1, 0
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_neg_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_wrong_const:
+; X86: # %bb.0:
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: negl %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_wrong_const:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: negl %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 %sub, i32 1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_neg_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) nounwind {
+; X86-LABEL: and_select_neg_different_op:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: negl %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_different_op:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: negl %edx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %edx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a2
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr1:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: leal -1(%eax), %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr1:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: leal -1(%eax), %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr2:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %bls, %a1
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr3:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: leal -1(%eax), %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: negb %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr3:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: negb %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 -1, i32 %sub
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr4:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: leal -1(%eax), %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr4:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 %a1, 1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: andb $1, %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: addl $-1, %edi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpb $1, %bl
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr_i64:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leaq -1(%rsi), %rcx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbq %rax, %rax
+; X64-NEXT: orq %rcx, %rax
+; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: retq
+ %sub = add i64 %a1, -1
+ %bls = select i1 %a0, i64 %sub, i64 -1
+ %ret = and i64 %a1, %bls
+ ret i64 %ret
+}
+
+; Negative test
+define i16 @and_select_sub_1_i16(i1 %a0, i16 %a1) nounwind {
+; X86-LABEL: and_select_sub_1_i16:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: leal -1(%edx), %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_i16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %sub = add i16 %a1, -1
+ %bls = select i1 %a0, i16 %sub, i16 -1
+ %ret = and i16 %a1, %bls
+ ret i16 %ret
+}
+
+; Negative test
+define <4 x i32> @and_select_sub_1_v4xi32(i1 %a0, <4 x i32> %a1) nounwind {
+; X86-LABEL: and_select_sub_1_v4xi32:
+; X86: # %bb.0:
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB15_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: .LBB15_2:
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_v4xi32:
+; X64: # %bb.0:
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: je .LBB15_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: paddd %xmm0, %xmm1
+; X64-NEXT: .LBB15_2:
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+ %sub = add <4 x i32> %a1, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %bls = select i1 %a0, <4 x i32> %sub, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+ %ret = and <4 x i32> %a1, %bls
+ ret <4 x i32> %ret
+}
+
+; Negative test
+define i32 @and_select_no_sub_1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_no_sub_1:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: leal -2(%eax), %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_no_sub_1:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -2(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -2
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_sub_1_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_wrong_const:
+; X86: # %bb.0:
+; X86-NEXT: leal -1(%eax), %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_wrong_const:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) nounwind {
+; X86-LABEL: and_select_sub_1_different_op:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: decl %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_different_op:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $edx killed $edx def $rdx
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -1(%rdx), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a2, -1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk1:
+; X86: # %bb.0:
+; X86-NEXT: leal -1(%eax), %ecx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk1:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 0
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk2:
+; X86: # %bb.0:
+; X86-NEXT: leal -1(%eax), %ecx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk2:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 0
+ %ret = xor i32 %bls, %a1
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk3:
+; X86: # %bb.0:
+; X86-NEXT: leal -1(%eax), %ecx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %edx
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk3:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %ecx, %eax
+; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 0, i32 %sub
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk4(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk4:
+; X86: # %bb.0:
+; X86-NEXT: leal -1(%eax), %ecx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk4:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 %a1, 1
+ %bls = select i1 %a0, i32 %sub, i32 0
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i64 @xor_select_sub_1_to_blsmsk_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %edi, %eax
+; X86-NEXT: cmovel %edi, %edx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk_i64:
+; X64: # %bb.0:
+; X64-NEXT: leaq -1(%rsi), %rcx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rcx, %rax
+; X64-NEXT: xorq %rsi, %rax
+; X64-NEXT: retq
+ %sub = add i64 %a1, -1
+ %bls = select i1 %a0, i64 %sub, i64 0
+ %ret = xor i64 %a1, %bls
+ ret i64 %ret
+}
+
+; Negative test
+define i16 @xor_select_sub_1_i16(i1 %a0, i16 %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_i16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: leal -1(%ecx), %edx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %edx, %eax
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_i16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %sub = add i16 %a1, -1
+ %bls = select i1 %a0, i16 %sub, i16 0
+ %ret = xor i16 %a1, %bls
+ ret i16 %ret
+}
+
+; Negative test
+define <4 x i32> @xor_select_sub_1_v4xi32(i1 %a0, <4 x i32> %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_v4xi32:
+; X86: # %bb.0:
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: jne .LBB25_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: xorps %xmm1, %xmm1
+; X86-NEXT: xorps %xmm1, %xmm0
+; X86-NEXT: retl
+; X86-NEXT: .LBB25_1:
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: pxor %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_v4xi32:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: jne .LBB25_1
+; X64-NEXT: # %bb.2:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: xorps %xmm1, %xmm0
+; X64-NEXT: retq
+; X64-NEXT: .LBB25_1:
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: paddd %xmm0, %xmm1
+; X64-NEXT: pxor %xmm1, %xmm0
+; X64-NEXT: retq
+ %sub = add <4 x i32> %a1, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %bls = select i1 %a0, <4 x i32> %sub, <4 x i32> zeroinitializer
+ %ret = xor <4 x i32> %a1, %bls
+ ret <4 x i32> %ret
+}
+
+; Negative test
+define i32 @xor_select_no_sub_1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_no_sub_1:
+; X86: # %bb.0:
+; X86-NEXT: leal 1(%eax), %ecx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_no_sub_1:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: leal 1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, 1
+ %bls = select i1 %a0, i32 %sub, i32 0
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @xor_select_sub_1_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_wrong_const:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: leal -1(%eax), %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: xorl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_wrong_const:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @xor_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) nounwind {
+; X86-LABEL: xor_select_sub_1_different_op:
+; X86: # %bb.0:
+; X86-NEXT: leal -1(%edx), %ecx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_different_op:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $edx killed $edx def $rdx
+; X64-NEXT: leal -1(%rdx), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a2, -1
+ %bls = select i1 %a0, i32 %sub, i32 0
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
From c6cb302016c8d32a0922466a23559fb72b1b2568 Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Sun, 30 Mar 2025 08:58:00 +0200
Subject: [PATCH 3/3] [X86] Distribute Certain Bitwise Operations over SELECT
InstCombine canonicalizes `(select P (and X (- X)) X)` to
`(and (select P (- X) umax) X)`, where `umax` is the all-ones identity
constant of `and`. This is counterproductive for the X86 backend when
BMI is available because we can encode `(and X (- X))` using the `BLSI`
instruction. A similar situation arises for
`(select P (and X (sub X 1)) X)` (which prevents use of the `BLSR`
instruction) and `(select P (xor X (sub X 1)) X)` (which prevents use of
the `BLSMSK` instruction).
Trigger the inverse transformation in the X86 backend if BMI is
available and the mentioned BMI instructions can be used. This is done
by overriding the appropriate `shouldFoldSelectWithIdentityConstant()`
overload. In this way, we get `(select P (and X (- X)) X)` again, which
enables the use of `BLSI` (and similarly for the other cases described
above).
Alive proofs: https://alive2.llvm.org/ce/z/MT_pKi
Fixes #131587, fixes #133848.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 33 ++-
llvm/lib/Target/X86/X86ISelLowering.h | 5 +
llvm/test/CodeGen/X86/bmi-select-distrib.ll | 299 ++++++--------------
3 files changed, 118 insertions(+), 219 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 993118c52564e..b31b1c5c2811d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27,6 +27,7 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -35385,18 +35386,48 @@ bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
EVT VT) const {
+ if (!VT.isVector()) {
+ if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
+ return false;
+ if (Opcode != ISD::AND && Opcode != ISD::XOR)
+ return false;
+ return true;
+ }
+
// TODO: This is too general. There are cases where pre-AVX512 codegen would
// benefit. The transform may also be profitable for scalar code.
if (!Subtarget.hasAVX512())
return false;
if (!Subtarget.hasVLX() && !VT.is512BitVector())
return false;
- if (!VT.isVector() || VT.getScalarType() == MVT::i1)
+ if (VT.getScalarType() == MVT::i1)
return false;
return true;
}
+bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
+ unsigned BinOpcode, unsigned SelectOpcode, SDValue X,
+ SDValue NonIdConstNode) const {
+ using namespace llvm::SDPatternMatch;
+
+ if (SelectOpcode == ISD::VSELECT)
+ return true;
+ // BLSI
+ if (BinOpcode == ISD::AND && sd_match(NonIdConstNode, m_Neg(m_Specific(X))))
+ return true;
+ // BLSR
+ if (BinOpcode == ISD::AND &&
+ sd_match(NonIdConstNode, m_Add(m_Specific(X), m_AllOnes())))
+ return true;
+ // BLSMSK
+ if (BinOpcode == ISD::XOR &&
+ sd_match(NonIdConstNode, m_Add(m_Specific(X), m_AllOnes())))
+ return true;
+
+ return false;
+}
+
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 4a2b35e9efe7c..5c60136034226 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1463,6 +1463,11 @@ namespace llvm {
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
EVT VT) const override;
+ bool
+ shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
+ unsigned SelectOpcode, SDValue X,
+ SDValue NonIdConstNode) const override;
+
/// Given an intrinsic, checks if on the target the intrinsic will need to map
/// to a MemIntrinsicNode (touches memory). If this is the case, it returns
/// true and stores the intrinsic information into the IntrinsicInfo that was
diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
index 1147561e22d06..466f877f57600 100644
--- a/llvm/test/CodeGen/X86/bmi-select-distrib.ll
+++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
@@ -5,29 +5,16 @@
define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_neg_to_blsi1:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andb $1, %cl
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: negl %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: cmpb $1, %cl
-; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: popl %esi
+; X86-NEXT: blsil %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: and_select_neg_to_blsi1:
; X64: # %bb.0:
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: negl %ecx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpb $1, %dil
-; X64-NEXT: sbbl %eax, %eax
-; X64-NEXT: orl %ecx, %eax
-; X64-NEXT: andl %esi, %eax
+; X64-NEXT: blsil %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
; X64-NEXT: retq
%sub = sub i32 0, %a1
%bls = select i1 %a0, i32 %sub, i32 -1
@@ -38,29 +25,16 @@ define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_neg_to_blsi2:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andb $1, %cl
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: negl %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: cmpb $1, %cl
-; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: popl %esi
+; X86-NEXT: blsil %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: and_select_neg_to_blsi2:
; X64: # %bb.0:
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: negl %ecx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpb $1, %dil
-; X64-NEXT: sbbl %eax, %eax
-; X64-NEXT: orl %ecx, %eax
-; X64-NEXT: andl %esi, %eax
+; X64-NEXT: blsil %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
; X64-NEXT: retq
%sub = sub i32 0, %a1
%bls = select i1 %a0, i32 %sub, i32 -1
@@ -71,29 +45,16 @@ define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_neg_to_blsi3:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andb $1, %cl
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: negl %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: negb %cl
-; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: popl %esi
+; X86-NEXT: blsil %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: and_select_neg_to_blsi3:
; X64: # %bb.0:
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: negl %ecx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: negb %dil
-; X64-NEXT: sbbl %eax, %eax
-; X64-NEXT: orl %ecx, %eax
-; X64-NEXT: andl %esi, %eax
+; X64-NEXT: blsil %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: retq
%sub = sub i32 0, %a1
%bls = select i1 %a0, i32 -1, i32 %sub
@@ -104,39 +65,26 @@ define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
; X86-LABEL: and_select_neg_to_blsi_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: andb $1, %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: negl %edi
-; X86-NEXT: movl $0, %edx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: negl %eax
; X86-NEXT: sbbl %esi, %edx
-; X86-NEXT: cmpb $1, %bl
-; X86-NEXT: sbbl %eax, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %edi, %eax
; X86-NEXT: andl %esi, %edx
; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %esi, %edx
+; X86-NEXT: cmovel %ecx, %eax
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: and_select_neg_to_blsi_i64:
; X64: # %bb.0:
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: movq %rsi, %rcx
-; X64-NEXT: negq %rcx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpb $1, %dil
-; X64-NEXT: sbbq %rax, %rax
-; X64-NEXT: orq %rcx, %rax
-; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: blsiq %rsi, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmoveq %rsi, %rax
; X64-NEXT: retq
%sub = sub i64 0, %a1
%bls = select i1 %a0, i64 %sub, i64 -1
@@ -306,28 +254,16 @@ define i32 @and_select_neg_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) no
define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_sub_1_to_blsr1:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andb $1, %cl
-; X86-NEXT: leal -1(%eax), %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: cmpb $1, %cl
-; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: popl %esi
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: and_select_sub_1_to_blsr1:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: leal -1(%rsi), %ecx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpb $1, %dil
-; X64-NEXT: sbbl %eax, %eax
-; X64-NEXT: orl %ecx, %eax
-; X64-NEXT: andl %esi, %eax
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
; X64-NEXT: retq
%sub = add i32 %a1, -1
%bls = select i1 %a0, i32 %sub, i32 -1
@@ -338,28 +274,16 @@ define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_sub_1_to_blsr2:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andb $1, %cl
-; X86-NEXT: leal -1(%eax), %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: cmpb $1, %cl
-; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: popl %esi
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: and_select_sub_1_to_blsr2:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: leal -1(%rsi), %ecx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpb $1, %dil
-; X64-NEXT: sbbl %eax, %eax
-; X64-NEXT: orl %ecx, %eax
-; X64-NEXT: andl %esi, %eax
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
; X64-NEXT: retq
%sub = add i32 %a1, -1
%bls = select i1 %a0, i32 %sub, i32 -1
@@ -370,28 +294,16 @@ define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_sub_1_to_blsr3:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andb $1, %cl
-; X86-NEXT: leal -1(%eax), %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: negb %cl
-; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: popl %esi
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: and_select_sub_1_to_blsr3:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: leal -1(%rsi), %ecx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: negb %dil
-; X64-NEXT: sbbl %eax, %eax
-; X64-NEXT: orl %ecx, %eax
-; X64-NEXT: andl %esi, %eax
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: retq
%sub = add i32 %a1, -1
%bls = select i1 %a0, i32 -1, i32 %sub
@@ -402,28 +314,16 @@ define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_sub_1_to_blsr4:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andb $1, %cl
-; X86-NEXT: leal -1(%eax), %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: cmpb $1, %cl
-; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: popl %esi
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: and_select_sub_1_to_blsr4:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: leal -1(%rsi), %ecx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpb $1, %dil
-; X64-NEXT: sbbl %eax, %eax
-; X64-NEXT: orl %ecx, %eax
-; X64-NEXT: andl %esi, %eax
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
; X64-NEXT: retq
%sub = sub i32 %a1, 1
%bls = select i1 %a0, i32 %sub, i32 -1
@@ -434,38 +334,26 @@ define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
; X86-LABEL: and_select_sub_1_to_blsr_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: andb $1, %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: addl $-1, %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %esi, %edx
; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb $1, %bl
-; X86-NEXT: sbbl %eax, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %edi, %eax
; X86-NEXT: andl %esi, %edx
; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovel %esi, %edx
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: and_select_sub_1_to_blsr_i64:
; X64: # %bb.0:
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: leaq -1(%rsi), %rcx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpb $1, %dil
-; X64-NEXT: sbbq %rax, %rax
-; X64-NEXT: orq %rcx, %rax
-; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: blsrq %rsi, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmoveq %rsi, %rax
; X64-NEXT: retq
%sub = add i64 %a1, -1
%bls = select i1 %a0, i64 %sub, i64 -1
@@ -633,21 +521,16 @@ define i32 @and_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2)
define i32 @xor_select_sub_1_to_blsmsk1(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: xor_select_sub_1_to_blsmsk1:
; X86: # %bb.0:
-; X86-NEXT: leal -1(%eax), %ecx
-; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: blsmskl %eax, %ecx
; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: xor_select_sub_1_to_blsmsk1:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: leal -1(%rsi), %ecx
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: blsmskl %esi, %eax
; X64-NEXT: testb $1, %dil
-; X64-NEXT: cmovnel %ecx, %eax
-; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: cmovel %esi, %eax
; X64-NEXT: retq
%sub = add i32 %a1, -1
%bls = select i1 %a0, i32 %sub, i32 0
@@ -658,21 +541,16 @@ define i32 @xor_select_sub_1_to_blsmsk1(i1 %a0, i32 inreg %a1) nounwind {
define i32 @xor_select_sub_1_to_blsmsk2(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: xor_select_sub_1_to_blsmsk2:
; X86: # %bb.0:
-; X86-NEXT: leal -1(%eax), %ecx
-; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: blsmskl %eax, %ecx
; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: xor_select_sub_1_to_blsmsk2:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: leal -1(%rsi), %ecx
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: blsmskl %esi, %eax
; X64-NEXT: testb $1, %dil
-; X64-NEXT: cmovnel %ecx, %eax
-; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: cmovel %esi, %eax
; X64-NEXT: retq
%sub = add i32 %a1, -1
%bls = select i1 %a0, i32 %sub, i32 0
@@ -683,21 +561,16 @@ define i32 @xor_select_sub_1_to_blsmsk2(i1 %a0, i32 inreg %a1) nounwind {
define i32 @xor_select_sub_1_to_blsmsk3(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: xor_select_sub_1_to_blsmsk3:
; X86: # %bb.0:
-; X86-NEXT: leal -1(%eax), %ecx
-; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: blsmskl %eax, %ecx
; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovel %ecx, %edx
-; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: cmovel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: xor_select_sub_1_to_blsmsk3:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: leal -1(%rsi), %ecx
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: blsmskl %esi, %eax
; X64-NEXT: testb $1, %dil
-; X64-NEXT: cmovel %ecx, %eax
-; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: retq
%sub = add i32 %a1, -1
%bls = select i1 %a0, i32 0, i32 %sub
@@ -708,21 +581,16 @@ define i32 @xor_select_sub_1_to_blsmsk3(i1 %a0, i32 inreg %a1) nounwind {
define i32 @xor_select_sub_1_to_blsmsk4(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: xor_select_sub_1_to_blsmsk4:
; X86: # %bb.0:
-; X86-NEXT: leal -1(%eax), %ecx
-; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: blsmskl %eax, %ecx
; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: xor_select_sub_1_to_blsmsk4:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: leal -1(%rsi), %ecx
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: blsmskl %esi, %eax
; X64-NEXT: testb $1, %dil
-; X64-NEXT: cmovnel %ecx, %eax
-; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: cmovel %esi, %eax
; X64-NEXT: retq
%sub = sub i32 %a1, 1
%bls = select i1 %a0, i32 %sub, i32 0
@@ -733,7 +601,6 @@ define i32 @xor_select_sub_1_to_blsmsk4(i1 %a0, i32 inreg %a1) nounwind {
define i64 @xor_select_sub_1_to_blsmsk_i64(i1 %a0, i64 %a1) nounwind {
; X86-LABEL: xor_select_sub_1_to_blsmsk_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -741,23 +608,19 @@ define i64 @xor_select_sub_1_to_blsmsk_i64(i1 %a0, i64 %a1) nounwind {
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %esi, %edx
; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovel %edi, %eax
-; X86-NEXT: cmovel %edi, %edx
-; X86-NEXT: xorl %ecx, %eax
; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovel %esi, %edx
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: xor_select_sub_1_to_blsmsk_i64:
; X64: # %bb.0:
-; X64-NEXT: leaq -1(%rsi), %rcx
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: blsmskq %rsi, %rax
; X64-NEXT: testb $1, %dil
-; X64-NEXT: cmovneq %rcx, %rax
-; X64-NEXT: xorq %rsi, %rax
+; X64-NEXT: cmoveq %rsi, %rax
; X64-NEXT: retq
%sub = add i64 %a1, -1
%bls = select i1 %a0, i64 %sub, i64 0