[llvm] e03d216 - [X86] Use bit test instructions to optimize some logic atomic operations

Phoebe Wang via llvm-commits <llvm-commits@lists.llvm.org>
Mon Feb 28 18:35:40 PST 2022


Author: Phoebe Wang
Date: 2022-03-01T09:57:08+08:00
New Revision: e03d216c28dfbda52afe7e1f6606cc5dafb3235e

URL: https://github.com/llvm/llvm-project/commit/e03d216c28dfbda52afe7e1f6606cc5dafb3235e
DIFF: https://github.com/llvm/llvm-project/commit/e03d216c28dfbda52afe7e1f6606cc5dafb3235e.diff

LOG: [X86] Use bit test instructions to optimize some logic atomic operations

This matches GCC's optimizations: https://gcc.godbolt.org/z/3odh9e7WE
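
For illustration, a minimal sketch of the IR pattern this patch targets
(mirroring the tests below; @v16 is a hypothetical i16 global):

    %old = atomicrmw or i16* @v16, i16 1 monotonic, align 2
    %bit = and i16 %old, 1

Previously this lowered to a "lock cmpxchgw" retry loop; with this patch it
becomes a single "lock btsw $0, v16" followed by a setb of the carry flag.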

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D120199

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/include/llvm/IR/IntrinsicsX86.td
    llvm/lib/CodeGen/AtomicExpandPass.cpp
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h
    llvm/lib/Target/X86/X86InstrCompiler.td
    llvm/test/CodeGen/X86/atomic-bit-test.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index fbb9767e4da71..3104428925af6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -253,7 +253,9 @@ class TargetLoweringBase {
     LLOnly,  // Expand the (load) instruction into just a load-linked, which has
              // greater atomic guarantees than a normal load.
     CmpXChg, // Expand the instruction into cmpxchg; used by at least X86.
-    MaskedIntrinsic, // Use a target-specific intrinsic for the LL/SC loop.
+    MaskedIntrinsic,  // Use a target-specific intrinsic for the LL/SC loop.
+    BitTestIntrinsic, // Use a target-specific intrinsic for special bit
+                      // operations; used by X86.
   };
 
   /// Enum that specifies when a multiplication should be expanded.
@@ -1951,6 +1953,14 @@ class TargetLoweringBase {
     llvm_unreachable("Masked atomicrmw expansion unimplemented on this target");
   }
 
+  /// Perform a bit test atomicrmw using a target-specific intrinsic. This
+  /// represents the combined bit test intrinsic which will be lowered at a late
+  /// stage by the backend.
+  virtual void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
+    llvm_unreachable(
+        "Bit test atomicrmw expansion unimplemented on this target");
+  }
+
   /// Perform a masked cmpxchg using a target-specific intrinsic. This
   /// represents the core LL/SC loop which will be lowered at a late stage by
   /// the backend.

diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 8de737a1c7a58..e954fe7cb8f67 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -62,6 +62,16 @@ let TargetPrefix = "x86" in {
               Intrinsic<[llvm_i32_ty], [], []>;
 }
 
+// Lock bit test.
+let TargetPrefix = "x86" in {
+  def int_x86_atomic_bts : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i8_ty],
+                                     [ImmArg<ArgIndex<1>>]>;
+  def int_x86_atomic_btc : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i8_ty],
+                                     [ImmArg<ArgIndex<1>>]>;
+  def int_x86_atomic_btr : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i8_ty],
+                                     [ImmArg<ArgIndex<1>>]>;
+}
+
 //===----------------------------------------------------------------------===//
 // CET SS
 let TargetPrefix = "x86" in {
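
As a usage note (the exact auto-mangled names are an assumption, following
the usual anyint overloading scheme), X86's emitBitTestAtomicRMWIntrinsic
(below) emits calls such as:

    declare i16 @llvm.x86.atomic.bts.i16(i8*, i8)

    ; Atomically set bit 3 of the i16 at %addr. The lowering returns the
    ; old value of that bit, shifted back into bit position 3.
    %res = call i16 @llvm.x86.atomic.bts.i16(i8* %addr, i8 3)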

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 4838f6da750dd..897018d202103 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -621,6 +621,10 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
     expandAtomicRMWToMaskedIntrinsic(AI);
     return true;
   }
+  case TargetLoweringBase::AtomicExpansionKind::BitTestIntrinsic: {
+    TLI->emitBitTestAtomicRMWIntrinsic(AI);
+    return true;
+  }
   default:
     llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
   }

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3fa92928e93d0..6ec80d0b02369 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5442,6 +5442,18 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
       Info.align = Align(1);
       Info.flags |= MachineMemOperand::MOLoad;
       return true;
+    case Intrinsic::x86_atomic_bts:
+    case Intrinsic::x86_atomic_btc:
+    case Intrinsic::x86_atomic_btr: {
+      Info.opc = ISD::INTRINSIC_W_CHAIN;
+      Info.ptrVal = I.getArgOperand(0);
+      unsigned Size = I.getType()->getScalarSizeInBits();
+      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
+      Info.align = Align(Size);
+      Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
+                    MachineMemOperand::MOVolatile;
+      return true;
+    }
     }
     return false;
   }
@@ -27510,6 +27522,30 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                          Operation.getValue(1));
     }
+    case Intrinsic::x86_atomic_bts:
+    case Intrinsic::x86_atomic_btc:
+    case Intrinsic::x86_atomic_btr: {
+      SDLoc DL(Op);
+      MVT VT = Op.getSimpleValueType();
+      SDValue Chain = Op.getOperand(0);
+      SDValue Op1 = Op.getOperand(2);
+      SDValue Op2 = Op.getOperand(3);
+      unsigned Opc = IntNo == Intrinsic::x86_atomic_bts   ? X86ISD::LBTS
+                     : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
+                                                          : X86ISD::LBTR;
+      SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
+      MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
+      SDValue Res =
+          DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
+                                  {Chain, Op1, Op2, Size}, VT, MMO);
+      Chain = Res.getValue(1);
+      Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
+      unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
+      if (Imm)
+        Res = DAG.getNode(ISD::SHL, DL, VT, Res,
+                          DAG.getShiftAmountConstant(Imm, VT, DL));
+      return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
+    }
     }
     return SDValue();
   }
@@ -30431,6 +30467,65 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
                                  : AtomicExpansionKind::None;
 }
 
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
+  // If the atomicrmw's result isn't actually used, we can just add a "lock"
+  // prefix to a normal instruction for these operations.
+  if (AI->use_empty())
+    return AtomicExpansionKind::None;
+
+  // If the atomicrmw's result is used by a single-bit AND, we may use a
+  // bts/btr/btc instruction for these operations.
+  auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
+  Instruction *I = AI->user_back();
+  if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And ||
+      AI->getParent() != I->getParent())
+    return AtomicExpansionKind::CmpXChg;
+  // The following instruction must be an AND with a single bit.
+  auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
+  unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
+  if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue()))
+    return AtomicExpansionKind::CmpXChg;
+
+  if (AI->getOperation() == AtomicRMWInst::And)
+    return ~C1->getValue() == C2->getValue()
+               ? AtomicExpansionKind::BitTestIntrinsic
+               : AtomicExpansionKind::CmpXChg;
+
+  return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
+                  : AtomicExpansionKind::CmpXChg;
+}
+
+void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
+  IRBuilder<> Builder(AI);
+  Intrinsic::ID IID = Intrinsic::not_intrinsic;
+  switch (AI->getOperation()) {
+  default:
+    llvm_unreachable("Unknown atomic operation");
+  case AtomicRMWInst::Or:
+    IID = Intrinsic::x86_atomic_bts;
+    break;
+  case AtomicRMWInst::Xor:
+    IID = Intrinsic::x86_atomic_btc;
+    break;
+  case AtomicRMWInst::And:
+    IID = Intrinsic::x86_atomic_btr;
+    break;
+  }
+  Instruction *I = AI->user_back();
+  LLVMContext &Ctx = AI->getContext();
+  unsigned Imm =
+      countTrailingZeros(cast<ConstantInt>(I->getOperand(1))->getZExtValue());
+  Function *BitTest =
+      Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
+  Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
+                                          Type::getInt8PtrTy(Ctx));
+  Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
+  I->replaceAllUsesWith(Result);
+  I->eraseFromParent();
+  AI->eraseFromParent();
+}
+
 TargetLowering::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
@@ -30455,10 +30550,7 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   case AtomicRMWInst::Or:
   case AtomicRMWInst::And:
   case AtomicRMWInst::Xor:
-    // If the atomicrmw's result isn't actually used, we can just add a "lock"
-    // prefix to a normal instruction for these operations.
-    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
-                            : AtomicExpansionKind::None;
+    return shouldExpandLogicAtomicRMWInIR(AI);
   case AtomicRMWInst::Nand:
   case AtomicRMWInst::Max:
   case AtomicRMWInst::Min:
@@ -32939,6 +33031,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(LOR)
   NODE_NAME_CASE(LXOR)
   NODE_NAME_CASE(LAND)
+  NODE_NAME_CASE(LBTS)
+  NODE_NAME_CASE(LBTC)
+  NODE_NAME_CASE(LBTR)
   NODE_NAME_CASE(VZEXT_MOVL)
   NODE_NAME_CASE(VZEXT_LOAD)
   NODE_NAME_CASE(VEXTRACT_STORE)
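
To make the rewrite concrete: for a btr-qualifying pair (the AND case, where
the atomicrmw mask is the complement of the single tested bit),
emitBitTestAtomicRMWIntrinsic turns IR like this (hypothetical global @v32):

    %old = atomicrmw and i32* @v32, i32 -9 monotonic, align 4  ; clears bit 3
    %bit = and i32 %old, 8                                     ; tests bit 3

into a single intrinsic call, with countTrailingZeros(8) = 3 as the immediate:

    %addr = bitcast i32* @v32 to i8*
    %bit = call i32 @llvm.x86.atomic.btr.i32(i8* %addr, i8 3)

The i8 (byte) width is deliberately rejected in shouldExpandLogicAtomicRMWInIR
since the bt-family instructions only exist in 16/32/64-bit forms.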

diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 99299dc884c98..80ffbf9a3dd0b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -787,6 +787,9 @@ namespace llvm {
     LOR,
     LXOR,
     LAND,
+    LBTS,
+    LBTC,
+    LBTR,
 
     // Load, scalar_to_vector, and zero extend.
     VZEXT_LOAD,
@@ -1640,6 +1643,9 @@ namespace llvm {
     bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
     TargetLoweringBase::AtomicExpansionKind
     shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+    TargetLoweringBase::AtomicExpansionKind
+    shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+    void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 
     LoadInst *
     lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;

diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index fe8126f3dc7d8..abad628bebd5c 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -839,6 +839,38 @@ let Predicates = [UseIncDec] in {
   def : Pat<(X86lock_sub addr:$dst, (i64 -1)), (LOCK_INC64m addr:$dst)>;
 }
 
+// Atomic bit test.
+def X86LBTest : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisPtrTy<1>,
+                                     SDTCisVT<2, i8>, SDTCisVT<3, i32>]>;
+def x86bts : SDNode<"X86ISD::LBTS", X86LBTest,
+                    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def x86btc : SDNode<"X86ISD::LBTC", X86LBTest,
+                    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def x86btr : SDNode<"X86ISD::LBTR", X86LBTest,
+                    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+
+multiclass ATOMIC_LOGIC_OP<Format Form, string s> {
+  let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+      SchedRW = [WriteBitTestSetRegRMW] in {
+    def 16m : Ii8<0xBA, Form, (outs), (ins i16mem:$src1, i8imm:$src2),
+                  !strconcat(s, "{w}\t{$src2, $src1|$src1, $src2}"),
+                  [(set EFLAGS, (!cast<SDNode>("x86" # s) addr:$src1, timm:$src2, (i32 16)))]>,
+              OpSize16, TB, LOCK;
+    def 32m : Ii8<0xBA, Form, (outs), (ins i32mem:$src1, i8imm:$src2),
+                  !strconcat(s, "{l}\t{$src2, $src1|$src1, $src2}"),
+                  [(set EFLAGS, (!cast<SDNode>("x86" # s) addr:$src1, timm:$src2, (i32 32)))]>,
+              OpSize32, TB, LOCK;
+    def 64m : RIi8<0xBA, Form, (outs), (ins i64mem:$src1, i8imm:$src2),
+                   !strconcat(s, "{q}\t{$src2, $src1|$src1, $src2}"),
+                   [(set EFLAGS, (!cast<SDNode>("x86" # s) addr:$src1, timm:$src2, (i32 64)))]>,
+              TB, LOCK;
+  }
+}
+
+defm LOCK_BTS : ATOMIC_LOGIC_OP<MRM5m, "bts">;
+defm LOCK_BTC : ATOMIC_LOGIC_OP<MRM7m, "btc">;
+defm LOCK_BTR : ATOMIC_LOGIC_OP<MRM6m, "btr">;
+
 // Atomic compare and swap.
 multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
                           string mnemonic, SDPatternOperator frag> {

diff --git a/llvm/test/CodeGen/X86/atomic-bit-test.ll b/llvm/test/CodeGen/X86/atomic-bit-test.ll
index da1b7aa14e707..4372fcf6be8a9 100644
--- a/llvm/test/CodeGen/X86/atomic-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-bit-test.ll
@@ -9,35 +9,17 @@
 define i16 @bts1() nounwind {
 ; X86-LABEL: bts1:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl v16, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB0_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl $1, %ecx
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, v16
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB0_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsw $0, v16
+; X86-NEXT:    setb %al
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bts1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzwl v16(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB0_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl $1, %ecx
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, v16(%rip)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB0_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsw $0, v16(%rip)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -49,35 +31,19 @@ entry:
 define i16 @bts2() nounwind {
 ; X86-LABEL: bts2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl v16, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB1_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl $2, %ecx
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, v16
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB1_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $2, %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsw $1, v16
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bts2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzwl v16(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB1_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl $2, %ecx
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, v16(%rip)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB1_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $2, %eax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsw $1, v16(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    addl %eax, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -89,35 +55,19 @@ entry:
 define i16 @bts15() nounwind {
 ; X86-LABEL: bts15:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl v16, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB2_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl $32768, %ecx # imm = 0x8000
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, v16
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB2_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $32768, %eax # imm = 0x8000
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsw $15, v16
+; X86-NEXT:    setb %al
+; X86-NEXT:    shll $15, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bts15:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzwl v16(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB2_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl $32768, %ecx # imm = 0x8000
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, v16(%rip)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB2_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $32768, %eax # imm = 0x8000
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsw $15, v16(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    shll $15, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -129,30 +79,18 @@ entry:
 define i32 @bts31() nounwind {
 ; X86-LABEL: bts31:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl v32, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB3_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl $-2147483648, %ecx # imm = 0x80000000
-; X86-NEXT:    lock cmpxchgl %ecx, v32
-; X86-NEXT:    jne .LBB3_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl $31, v32
+; X86-NEXT:    setb %al
+; X86-NEXT:    shll $31, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bts31:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl v32(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB3_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl $-2147483648, %ecx # imm = 0x80000000
-; X64-NEXT:    lock cmpxchgl %ecx, v32(%rip)
-; X64-NEXT:    jne .LBB3_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl $31, v32(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    shll $31, %eax
 ; X64-NEXT:    retq
 entry:
   %0 = atomicrmw or i32* @v32, i32 2147483648 monotonic, align 4
@@ -185,17 +123,10 @@ define i64 @bts63() nounwind {
 ;
 ; X64-LABEL: bts63:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X64-NEXT:    movq v64(%rip), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB4_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    orq %rcx, %rdx
-; X64-NEXT:    lock cmpxchgq %rdx, v64(%rip)
-; X64-NEXT:    jne .LBB4_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andq %rcx, %rax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsq $63, v64(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    shlq $63, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = atomicrmw or i64* @v64, i64 -9223372036854775808 monotonic, align 8
@@ -206,35 +137,17 @@ entry:
 define i16 @btc1() nounwind {
 ; X86-LABEL: btc1:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl v16, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB5_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl $1, %ecx
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, v16
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB5_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btcw $0, v16
+; X86-NEXT:    setb %al
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: btc1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzwl v16(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB5_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    xorl $1, %ecx
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, v16(%rip)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB5_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcw $0, v16(%rip)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -246,35 +159,19 @@ entry:
 define i16 @btc2() nounwind {
 ; X86-LABEL: btc2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl v16, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB6_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl $2, %ecx
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, v16
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB6_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $2, %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btcw $1, v16
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: btc2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzwl v16(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB6_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    xorl $2, %ecx
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, v16(%rip)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB6_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $2, %eax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcw $1, v16(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    addl %eax, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -286,35 +183,19 @@ entry:
 define i16 @btc15() nounwind {
 ; X86-LABEL: btc15:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl v16, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB7_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl $32768, %ecx # imm = 0x8000
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, v16
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB7_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $32768, %eax # imm = 0x8000
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btcw $15, v16
+; X86-NEXT:    setb %al
+; X86-NEXT:    shll $15, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: btc15:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzwl v16(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB7_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    xorl $32768, %ecx # imm = 0x8000
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, v16(%rip)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB7_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $32768, %eax # imm = 0x8000
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcw $15, v16(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    shll $15, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -326,30 +207,18 @@ entry:
 define i32 @btc31() nounwind {
 ; X86-LABEL: btc31:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl v32, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB8_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl $-2147483648, %ecx # imm = 0x80000000
-; X86-NEXT:    lock cmpxchgl %ecx, v32
-; X86-NEXT:    jne .LBB8_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btcl $31, v32
+; X86-NEXT:    setb %al
+; X86-NEXT:    shll $31, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: btc31:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl v32(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB8_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    xorl $-2147483648, %ecx # imm = 0x80000000
-; X64-NEXT:    lock cmpxchgl %ecx, v32(%rip)
-; X64-NEXT:    jne .LBB8_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcl $31, v32(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    shll $31, %eax
 ; X64-NEXT:    retq
 entry:
   %0 = atomicrmw xor i32* @v32, i32 2147483648 monotonic, align 4
@@ -382,17 +251,10 @@ define i64 @btc63() nounwind {
 ;
 ; X64-LABEL: btc63:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X64-NEXT:    movq v64(%rip), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB9_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    xorq %rcx, %rdx
-; X64-NEXT:    lock cmpxchgq %rdx, v64(%rip)
-; X64-NEXT:    jne .LBB9_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andq %rcx, %rax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcq $63, v64(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    shlq $63, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = atomicrmw xor i64* @v64, i64 -9223372036854775808 monotonic, align 8
@@ -403,35 +265,17 @@ entry:
 define i16 @btr1() nounwind {
 ; X86-LABEL: btr1:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl v16, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB10_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $65534, %ecx # imm = 0xFFFE
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, v16
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB10_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btrw $0, v16
+; X86-NEXT:    setb %al
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: btr1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzwl v16(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB10_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $65534, %ecx # imm = 0xFFFE
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, v16(%rip)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB10_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrw $0, v16(%rip)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -443,35 +287,19 @@ entry:
 define i16 @btr2() nounwind {
 ; X86-LABEL: btr2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl v16, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB11_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $65533, %ecx # imm = 0xFFFD
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, v16
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB11_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $2, %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btrw $1, v16
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: btr2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzwl v16(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB11_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $65533, %ecx # imm = 0xFFFD
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, v16(%rip)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB11_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $2, %eax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrw $1, v16(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    addl %eax, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -483,35 +311,19 @@ entry:
 define i16 @btr15() nounwind {
 ; X86-LABEL: btr15:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl v16, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB12_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, v16
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB12_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $32768, %eax # imm = 0x8000
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btrw $15, v16
+; X86-NEXT:    setb %al
+; X86-NEXT:    shll $15, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: btr15:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzwl v16(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB12_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $32767, %ecx # imm = 0x7FFF
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, v16(%rip)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB12_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $32768, %eax # imm = 0x8000
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrw $15, v16(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    shll $15, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -523,30 +335,18 @@ entry:
 define i32 @btr31() nounwind {
 ; X86-LABEL: btr31:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl v32, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB13_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT:    lock cmpxchgl %ecx, v32
-; X86-NEXT:    jne .LBB13_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btrl $31, v32
+; X86-NEXT:    setb %al
+; X86-NEXT:    shll $31, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: btr31:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl v32(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB13_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X64-NEXT:    lock cmpxchgl %ecx, v32(%rip)
-; X64-NEXT:    jne .LBB13_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrl $31, v32(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    shll $31, %eax
 ; X64-NEXT:    retq
 entry:
   %0 = atomicrmw and i32* @v32, i32 2147483647 monotonic, align 4
@@ -585,18 +385,10 @@ define i64 @btr63() nounwind {
 ;
 ; X64-LABEL: btr63:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    movq v64(%rip), %rax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB14_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    lock cmpxchgq %rdx, v64(%rip)
-; X64-NEXT:    jne .LBB14_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    incq %rcx
-; X64-NEXT:    andq %rcx, %rax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrq $63, v64(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    shlq $63, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = atomicrmw and i64* @v64, i64 9223372036854775807 monotonic, align 8
@@ -655,36 +447,18 @@ entry:
 define i16 @multi_use2() nounwind {
 ; X86-LABEL: multi_use2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl v16, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB16_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl $1, %ecx
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, v16
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB16_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsw $0, v16
+; X86-NEXT:    setb %al
 ; X86-NEXT:    leal (%eax,%eax,2), %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: multi_use2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzwl v16(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB16_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl $1, %ecx
-; X64-NEXT:    # kill: def $ax killed $ax killed $rax
-; X64-NEXT:    lock cmpxchgw %cx, v16(%rip)
-; X64-NEXT:    # kill: def $ax killed $ax def $rax
-; X64-NEXT:    jne .LBB16_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsw $0, v16(%rip)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    leal (%rax,%rax,2), %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
@@ -764,39 +538,23 @@ declare void @foo()
 define void @no_and_cmp0_fold() nounwind {
 ; X86-LABEL: no_and_cmp0_fold:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl v32, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB18_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl $8, %ecx
-; X86-NEXT:    lock cmpxchgl %ecx, v32
-; X86-NEXT:    jne .LBB18_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-NEXT:    lock btsl $3, v32
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    testb %al, %al
-; X86-NEXT:    je .LBB18_3
-; X86-NEXT:  # %bb.4: # %if.end
+; X86-NEXT:    je .LBB18_1
+; X86-NEXT:  # %bb.2: # %if.end
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB18_3: # %if.then
+; X86-NEXT:  .LBB18_1: # %if.then
 ;
 ; X64-LABEL: no_and_cmp0_fold:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl v32(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB18_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl $8, %ecx
-; X64-NEXT:    lock cmpxchgl %ecx, v32(%rip)
-; X64-NEXT:    jne .LBB18_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
+; X64-NEXT:    lock btsl $3, v32(%rip)
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    testb %al, %al
-; X64-NEXT:    je .LBB18_3
-; X64-NEXT:  # %bb.4: # %if.end
+; X64-NEXT:    je .LBB18_1
+; X64-NEXT:  # %bb.2: # %if.end
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB18_3: # %if.then
+; X64-NEXT:  .LBB18_1: # %if.then
 entry:
   %0 = atomicrmw or i32* @v32, i32 8 monotonic, align 4
   %and = and i32 %0, 8
@@ -815,32 +573,20 @@ define i32 @split_hoist_and(i32 %0) nounwind {
 ; X86-LABEL: split_hoist_and:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl v32, %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB19_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    orl $8, %edx
-; X86-NEXT:    lock cmpxchgl %edx, v32
-; X86-NEXT:    jne .LBB19_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl $3, v32
+; X86-NEXT:    setb %al
+; X86-NEXT:    shll $3, %eax
 ; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    andl $8, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: split_hoist_and:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl v32(%rip), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB19_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl $8, %ecx
-; X64-NEXT:    lock cmpxchgl %ecx, v32(%rip)
-; X64-NEXT:    jne .LBB19_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl $3, v32(%rip)
+; X64-NEXT:    setb %al
+; X64-NEXT:    shll $3, %eax
 ; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    andl $8, %eax
 ; X64-NEXT:    retq
   %2 = atomicrmw or i32* @v32, i32 8 monotonic, align 4
   %3 = tail call i32 @llvm.ctlz.i32(i32 %0, i1 false)

