[llvm] 0b74e34 - Transform AtomicRMW logic operations to BT{R|C|S} if only changing/testing a single bit.

Noah Goldstein via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 16 22:09:51 PST 2023


Author: Noah Goldstein
Date: 2023-01-16T22:05:47-08:00
New Revision: 0b74e34938ba6cb89cbd197835219a6970ebaf39

URL: https://github.com/llvm/llvm-project/commit/0b74e34938ba6cb89cbd197835219a6970ebaf39
DIFF: https://github.com/llvm/llvm-project/commit/0b74e34938ba6cb89cbd197835219a6970ebaf39.diff

LOG: Transform AtomicRMW logic operations to BT{R|C|S} if only changing/testing a single bit.

This is essentially an expansion of the optimizations added in D120199,
but it applies the optimization to cases where the bit being changed /
tested is not an immediate but is a provable power of 2.

The only case currently added is for patterns like:
`__atomic_fetch_xor(p, 1 << c, __ATOMIC_RELAXED) & (1 << c)`

which, instead of using a `cmpxchg` loop, can be done with `btcl; setcc; shl`.
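
For example (a minimal sketch; the function name and the local `mask`
variable are made up for illustration, and it assumes `c < 64`):

    // Atomically flip bit `c` of *p and return the old value of that bit,
    // still in its original bit position.
    unsigned long fetch_and_flip_bit(unsigned long *p, unsigned long c) {
        unsigned long mask = 1UL << c;
        return __atomic_fetch_xor(p, mask, __ATOMIC_RELAXED) & mask;
    }

With this patch the 64-bit version compiles to `lock btcq` + `setb` + `shlq`
(see the updated atomic_shl1_xor_64_gpr_val checks below) rather than a
`cmpxchg` retry loop. The bit position is masked with 63 first because
BT{S|R|C} with a memory operand does not modulo the bit index.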

There are still a variety of missed cases that could/should be
addressed in the future. This commit documents many of those cases
with TODOs.
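
Concretely, instead of expanding the `atomicrmw` into a `cmpxchg` loop,
AtomicExpand now asks the target to emit one of the new `*_rm` intrinsics
added here. A rough sketch of the IR shape for the xor example above
(hand-written to show the shape, not compiler output; value names are made
up):

    %bitpos = and i64 %c, 63     ; bt on a memory operand has no modulo, so mask
    %oldbit = call i8 @llvm.x86.atomic.btc.rm.i64(ptr %p, i64 %bitpos)
    %zext   = zext i8 %oldbit to i64
    %res    = shl i64 %zext, %bitpos  ; dropped if the result is only compared against zero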

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D140939

Added: 
    

Modified: 
    llvm/include/llvm/IR/IntrinsicsX86.td
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h
    llvm/lib/Target/X86/X86InstrCompiler.td
    llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
    llvm/test/CodeGen/X86/atomic-rm-bit-test.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 4d04e8fbfbb11..239f15809e29e 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -70,6 +70,14 @@ let TargetPrefix = "x86" in {
                                      [ImmArg<ArgIndex<1>>]>;
   def int_x86_atomic_btr : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i8_ty],
                                      [ImmArg<ArgIndex<1>>]>;
+  def int_x86_atomic_bts_rm  : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty],
+                                         []>;
+  def int_x86_atomic_btc_rm  : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty],
+                                         []>;
+  def int_x86_atomic_btr_rm  : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty],
+                                         []>;
+
+
 }
 
 // Lock binary arith with CC.

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e3e04a2fd68c9..c88c66d8b2edc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5654,6 +5654,18 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                     MachineMemOperand::MOVolatile;
       return true;
     }
+    case Intrinsic::x86_atomic_bts_rm:
+    case Intrinsic::x86_atomic_btc_rm:
+    case Intrinsic::x86_atomic_btr_rm: {
+      Info.opc = ISD::INTRINSIC_W_CHAIN;
+      Info.ptrVal = I.getArgOperand(0);
+      unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
+      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
+      Info.align = Align(Size);
+      Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
+                    MachineMemOperand::MOVolatile;
+      return true;
+    }
     case Intrinsic::x86_aadd32:
     case Intrinsic::x86_aadd64:
     case Intrinsic::x86_aand32:
@@ -28364,6 +28376,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                          Operation.getValue(1));
     }
+    case Intrinsic::x86_atomic_bts_rm:
+    case Intrinsic::x86_atomic_btc_rm:
+    case Intrinsic::x86_atomic_btr_rm: {
+      SDLoc DL(Op);
+      MVT VT = Op.getSimpleValueType();
+      SDValue Chain = Op.getOperand(0);
+      SDValue Op1 = Op.getOperand(2);
+      SDValue Op2 = Op.getOperand(3);
+      unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm   ? X86ISD::LBTS_RM
+                     : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
+                                                             : X86ISD::LBTR_RM;
+      MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
+      SDValue Res =
+          DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
+                                  {Chain, Op1, Op2}, VT, MMO);
+      Chain = Res.getValue(1);
+      Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
+      return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
+    }
     case Intrinsic::x86_atomic_bts:
     case Intrinsic::x86_atomic_btc:
     case Intrinsic::x86_atomic_btr: {
@@ -31401,6 +31432,75 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
                                  : AtomicExpansionKind::None;
 }
 
+enum BitTestKind : unsigned {
+  UndefBit,
+  ConstantBit,
+  NotConstantBit,
+  ShiftBit,
+  NotShiftBit
+};
+
+static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
+  using namespace llvm::PatternMatch;
+  BitTestKind BTK = UndefBit;
+  auto *C = dyn_cast<ConstantInt>(V);
+  if (C) {
+    // Check if V is a power of 2 or NOT power of 2.
+    if (isPowerOf2_64(C->getZExtValue()))
+      BTK = ConstantBit;
+    else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
+      BTK = NotConstantBit;
+    return {V, BTK};
+  }
+
+  // Check if V is some power of 2 pattern known to be non-zero
+  auto *I = dyn_cast<Instruction>(V);
+  if (I) {
+    bool Not = false;
+    // Check if we have a NOT
+    Value *PeekI;
+    if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
+        match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
+      Not = true;
+      I = dyn_cast<Instruction>(PeekI);
+      assert(I != nullptr);
+    }
+    // We can only use 1 << X without more sophisticated analysis. C << X where
+    // C is a power of 2 but not 1 can result in zero which cannot be translated
+    // to bittest. Likewise any C >> X (either arith or logical) can be zero.
+    if (I->getOpcode() == Instruction::Shl) {
+      // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
+      // -X` and some other provable power of 2 patterns that we can use CTZ on
+      // may be profitable.
+      // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
+      // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
+      // be provably a non-zero power of 2.
+      // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
+      // transformable to bittest.
+      auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
+      if (!ShiftVal)
+        return {nullptr, UndefBit};
+      if (ShiftVal->equalsInt(1))
+        BTK = Not ? NotShiftBit : ShiftBit;
+
+      if (BTK == UndefBit)
+        return {nullptr, UndefBit};
+
+      Value *BitV = I->getOperand(1);
+
+      Value *AndOp;
+      const APInt *AndC;
+      if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
+        // Read past a shiftmask instruction to find count
+        if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
+          BitV = AndOp;
+      }
+      return {BitV, BTK};
+    }
+  }
+  return {nullptr, UndefBit};
+}
+
 TargetLowering::AtomicExpansionKind
 X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
   // If the atomicrmw's result isn't actually used, we can just add a "lock"
@@ -31410,51 +31510,138 @@ X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
 
   // If the atomicrmw's result is used by a single bit AND, we may use
   // bts/btr/btc instruction for these operations.
-  auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
+  // Note: InstCombinePass can cause a de-optimization here. It replaces the
+  // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
+  // (depending on CC). This pattern can only use bts/btr/btc but we don't
+  // detect it.
   Instruction *I = AI->user_back();
-  if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And ||
+  auto BitChange = FindSingleBitChange(AI->getValOperand());
+  if (BitChange.second == UndefBit || !AI->hasOneUse() ||
+      I->getOpcode() != Instruction::And ||
+      AI->getType()->getPrimitiveSizeInBits() == 8 ||
       AI->getParent() != I->getParent())
     return AtomicExpansionKind::CmpXChg;
+
+  assert(I->getOperand(0) == AI);
   // The following instruction must be a AND single bit.
-  auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
-  unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
-  if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue()))
+  if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
+    auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
+    assert(C1 != nullptr);
+    auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
+      return AtomicExpansionKind::CmpXChg;
+    }
+    if (AI->getOperation() == AtomicRMWInst::And) {
+      return ~C1->getValue() == C2->getValue()
+                 ? AtomicExpansionKind::BitTestIntrinsic
+                 : AtomicExpansionKind::CmpXChg;
+    }
+    return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
+                    : AtomicExpansionKind::CmpXChg;
+  }
+
+  assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
+
+  auto BitTested = FindSingleBitChange(I->getOperand(1));
+  if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
+    return AtomicExpansionKind::CmpXChg;
+
+  assert(BitChange.first != nullptr && BitTested.first != nullptr);
+
+  // If shift amounts are not the same we can't use BitTestIntrinsic.
+  if (BitChange.first != BitTested.first)
     return AtomicExpansionKind::CmpXChg;
 
+  // For atomic AND, the operation must mask off exactly one bit and the test
+  // must check the single bit that is unset in the mask.
   if (AI->getOperation() == AtomicRMWInst::And)
-    return ~C1->getValue() == C2->getValue()
+    return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
                ? AtomicExpansionKind::BitTestIntrinsic
                : AtomicExpansionKind::CmpXChg;
 
-  return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
-                  : AtomicExpansionKind::CmpXChg;
+  // For atomic XOR/OR, the operation must set and test the same bit.
+  return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
+             ? AtomicExpansionKind::BitTestIntrinsic
+             : AtomicExpansionKind::CmpXChg;
 }
 
 void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
-  Intrinsic::ID IID = Intrinsic::not_intrinsic;
+  Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
+  Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
   switch (AI->getOperation()) {
   default:
     llvm_unreachable("Unknown atomic operation");
   case AtomicRMWInst::Or:
-    IID = Intrinsic::x86_atomic_bts;
+    IID_C = Intrinsic::x86_atomic_bts;
+    IID_I = Intrinsic::x86_atomic_bts_rm;
     break;
   case AtomicRMWInst::Xor:
-    IID = Intrinsic::x86_atomic_btc;
+    IID_C = Intrinsic::x86_atomic_btc;
+    IID_I = Intrinsic::x86_atomic_btc_rm;
     break;
   case AtomicRMWInst::And:
-    IID = Intrinsic::x86_atomic_btr;
+    IID_C = Intrinsic::x86_atomic_btr;
+    IID_I = Intrinsic::x86_atomic_btr_rm;
     break;
   }
   Instruction *I = AI->user_back();
   LLVMContext &Ctx = AI->getContext();
-  unsigned Imm =
-      countTrailingZeros(cast<ConstantInt>(I->getOperand(1))->getZExtValue());
-  Function *BitTest =
-      Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
   Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
                                           Type::getInt8PtrTy(Ctx));
-  Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
+  Function *BitTest = nullptr;
+  Value *Result = nullptr;
+  auto BitTested = FindSingleBitChange(AI->getValOperand());
+  assert(BitTested.first != nullptr);
+  if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
+    auto *C = dyn_cast<ConstantInt>(I->getOperand(1));
+    assert(C != nullptr);
+
+    BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
+
+    unsigned Imm = countTrailingZeros(C->getZExtValue());
+    Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
+  } else {
+    BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
+
+    assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
+
+    Value *SI = BitTested.first;
+    assert(SI != nullptr);
+
+    // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
+    // need to mask it.
+    unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
+    Value *BitPos =
+        Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
+    // Todo(1): In many cases it may be provable that SI is less than
+    // ShiftBits, in which case this mask is unnecessary.
+    // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
+    // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
+    // favor of just a raw BT{S|R|C}.
+
+    Result = Builder.CreateCall(BitTest, {Addr, BitPos});
+    Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
+
+    // If the result is only used for zero/non-zero status then we don't need
+    // to shift the value back. Otherwise do so.
+    for (auto It = I->user_begin(); It != I->user_end(); ++It) {
+      if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
+        if (ICmp->isEquality()) {
+          auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
+          auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
+          if (C0 || C1) {
+            assert(C0 == nullptr || C1 == nullptr);
+            if ((C0 ? C0 : C1)->isZero())
+              continue;
+          }
+        }
+      }
+      Result = Builder.CreateShl(Result, BitPos);
+      break;
+    }
+  }
+
   I->replaceAllUsesWith(Result);
   I->eraseFromParent();
   AI->eraseFromParent();
@@ -34242,6 +34429,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(LBTS)
   NODE_NAME_CASE(LBTC)
   NODE_NAME_CASE(LBTR)
+  NODE_NAME_CASE(LBTS_RM)
+  NODE_NAME_CASE(LBTC_RM)
+  NODE_NAME_CASE(LBTR_RM)
   NODE_NAME_CASE(AADD)
   NODE_NAME_CASE(AOR)
   NODE_NAME_CASE(AXOR)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b727725613864..c08227b5b383a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -798,6 +798,9 @@ namespace llvm {
     LBTS,
     LBTC,
     LBTR,
+    LBTS_RM,
+    LBTC_RM,
+    LBTR_RM,
 
     /// RAO arithmetic instructions.
     /// OUTCHAIN = AADD(INCHAIN, PTR, RHS)

diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 823784ce7e989..8fddd0037999d 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -864,6 +864,17 @@ def x86btc : SDNode<"X86ISD::LBTC", X86LBTest,
 def x86btr : SDNode<"X86ISD::LBTR", X86LBTest,
                     [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
 
+def X86LBTestRM : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>,
+                                       SDTCisInt<2>]>;
+
+def x86_rm_bts : SDNode<"X86ISD::LBTS_RM", X86LBTestRM,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def x86_rm_btc : SDNode<"X86ISD::LBTC_RM", X86LBTestRM,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def x86_rm_btr : SDNode<"X86ISD::LBTR_RM", X86LBTestRM,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+
+
 multiclass ATOMIC_LOGIC_OP<Format Form, string s> {
   let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
       SchedRW = [WriteBitTestSetRegRMW]  in {
@@ -882,10 +893,33 @@ multiclass ATOMIC_LOGIC_OP<Format Form, string s> {
   }
 }
 
+multiclass ATOMIC_LOGIC_OP_RM<bits<8> Opc8, string s> {
+  let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+      SchedRW = [WriteBitTestSetRegRMW]  in {
+    def 16rm : Ii8<Opc8, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                  !strconcat(s, "{w}\t{$src2, $src1|$src1, $src2}"),
+                  [(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR16:$src2))]>,
+               OpSize16, TB, LOCK;
+    def 32rm : Ii8<Opc8, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                  !strconcat(s, "{l}\t{$src2, $src1|$src1, $src2}"),
+                  [(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR32:$src2))]>,
+               OpSize32, TB, LOCK;
+    def 64rm : RIi8<Opc8, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                   !strconcat(s, "{q}\t{$src2, $src1|$src1, $src2}"),
+                   [(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR64:$src2))]>,
+               TB, LOCK;
+  }
+}
+
+
 defm LOCK_BTS : ATOMIC_LOGIC_OP<MRM5m, "bts">;
 defm LOCK_BTC : ATOMIC_LOGIC_OP<MRM7m, "btc">;
 defm LOCK_BTR : ATOMIC_LOGIC_OP<MRM6m, "btr">;
 
+defm LOCK_BTS_RM : ATOMIC_LOGIC_OP_RM<0xAB, "bts">;
+defm LOCK_BTC_RM : ATOMIC_LOGIC_OP_RM<0xBB, "btc">;
+defm LOCK_BTR_RM : ATOMIC_LOGIC_OP_RM<0xB3, "btr">;
+
 // Atomic compare and swap.
 multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
                           string mnemonic, SDPatternOperator frag> {

diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
index fbaf2303df99b..f7ebf302adb6d 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
@@ -5,19 +5,12 @@ define i64 @atomic_shl1_xor_64_gpr_val(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_xor_64_gpr_val:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movl $1, %edx
+; CHECK-NEXT:    andl $63, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock btcq %rcx, (%rdi)
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rcx
-; CHECK-NEXT:    xorq %rdx, %rcx
-; CHECK-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; CHECK-NEXT:    jne .LBB0_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    shlq %cl, %rax
 ; CHECK-NEXT:    retq
 entry:
   %shl = shl nuw i64 1, %c
@@ -85,20 +78,12 @@ define i64 @atomic_shl1_small_mask_xor_64_gpr_val(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_small_mask_xor_64_gpr_val:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    andb $31, %cl
-; CHECK-NEXT:    movl $1, %edx
+; CHECK-NEXT:    andl $31, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock btcq %rcx, (%rdi)
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB3_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rcx
-; CHECK-NEXT:    xorq %rdx, %rcx
-; CHECK-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; CHECK-NEXT:    jne .LBB3_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    andl %edx, %eax
+; CHECK-NEXT:    shlq %cl, %rax
 ; CHECK-NEXT:    retq
 entry:
   %rem = and i64 %c, 31
@@ -112,21 +97,12 @@ define i64 @atomic_shl1_mask0_xor_64_gpr_val(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_mask0_xor_64_gpr_val:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movl $1, %edx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB4_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rsi
-; CHECK-NEXT:    xorq %rdx, %rsi
-; CHECK-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; CHECK-NEXT:    jne .LBB4_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    movl $1, %edx
+; CHECK-NEXT:    andl $63, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock btcq %rcx, (%rdi)
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    shlq %cl, %rax
 ; CHECK-NEXT:    retq
 entry:
   %rem = and i64 %c, 63
@@ -141,21 +117,12 @@ define i64 @atomic_shl1_mask1_xor_64_gpr_val(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_mask1_xor_64_gpr_val:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movl $1, %edx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB5_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rsi
-; CHECK-NEXT:    xorq %rdx, %rsi
-; CHECK-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; CHECK-NEXT:    jne .LBB5_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    movl $1, %edx
+; CHECK-NEXT:    andl $63, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock btcq %rcx, (%rdi)
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    shlq %cl, %rax
 ; CHECK-NEXT:    retq
 entry:
   %shl = shl nuw i64 1, %c
@@ -170,19 +137,12 @@ define i64 @atomic_shl1_mask01_xor_64_gpr_val(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_mask01_xor_64_gpr_val:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movl $1, %edx
+; CHECK-NEXT:    andl $63, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock btcq %rcx, (%rdi)
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB6_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rcx
-; CHECK-NEXT:    xorq %rdx, %rcx
-; CHECK-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; CHECK-NEXT:    jne .LBB6_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    shlq %cl, %rax
 ; CHECK-NEXT:    retq
 entry:
   %rem = and i64 %c, 63
@@ -701,21 +661,12 @@ define i64 @atomic_shl1_and_64_gpr_val(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_and_64_gpr_val:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movl $1, %edx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq $-2, %rsi
+; CHECK-NEXT:    andl $63, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock btrq %rcx, (%rdi)
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT:    rolq %cl, %rsi
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB24_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rcx
-; CHECK-NEXT:    andq %rsi, %rcx
-; CHECK-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; CHECK-NEXT:    jne .LBB24_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    shlq %cl, %rax
 ; CHECK-NEXT:    retq
 entry:
   %shl = shl nuw i64 1, %c
@@ -788,22 +739,12 @@ define i64 @atomic_shl1_small_mask_and_64_gpr_val(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_small_mask_and_64_gpr_val:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    andb $31, %cl
-; CHECK-NEXT:    movl $1, %edx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq $-2, %rsi
+; CHECK-NEXT:    andl $31, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock btrq %rcx, (%rdi)
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT:    rolq %cl, %rsi
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB27_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rcx
-; CHECK-NEXT:    andq %rsi, %rcx
-; CHECK-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; CHECK-NEXT:    jne .LBB27_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    andl %edx, %eax
+; CHECK-NEXT:    shlq %cl, %rax
 ; CHECK-NEXT:    retq
 entry:
   %rem = and i64 %c, 31
@@ -818,21 +759,12 @@ define i64 @atomic_shl1_mask0_and_64_gpr_val(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_mask0_and_64_gpr_val:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movq $-2, %rdx
-; CHECK-NEXT:    rolq %cl, %rdx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB28_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rsi
-; CHECK-NEXT:    andq %rdx, %rsi
-; CHECK-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; CHECK-NEXT:    jne .LBB28_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    movl $1, %edx
+; CHECK-NEXT:    andl $63, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock btrq %rcx, (%rdi)
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    shlq %cl, %rax
 ; CHECK-NEXT:    retq
 entry:
   %rem = and i64 %c, 63
@@ -848,21 +780,12 @@ define i64 @atomic_shl1_mask1_and_64_gpr_val(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_mask1_and_64_gpr_val:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movq $-2, %rdx
-; CHECK-NEXT:    rolq %cl, %rdx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB29_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rsi
-; CHECK-NEXT:    andq %rdx, %rsi
-; CHECK-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; CHECK-NEXT:    jne .LBB29_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    movl $1, %edx
+; CHECK-NEXT:    andl $63, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock btrq %rcx, (%rdi)
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    shlq %cl, %rax
 ; CHECK-NEXT:    retq
 entry:
   %shl = shl nuw i64 1, %c
@@ -878,21 +801,12 @@ define i64 @atomic_shl1_mask01_and_64_gpr_val(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_mask01_and_64_gpr_val:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movl $1, %edx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq $-2, %rsi
+; CHECK-NEXT:    andl $63, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock btrq %rcx, (%rdi)
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT:    rolq %cl, %rsi
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB30_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rcx
-; CHECK-NEXT:    andq %rsi, %rcx
-; CHECK-NEXT:    lock cmpxchgq %rcx, (%rdi)
-; CHECK-NEXT:    jne .LBB30_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    shlq %cl, %rax
 ; CHECK-NEXT:    retq
 entry:
   %rem = and i64 %c, 63
@@ -1185,26 +1099,14 @@ entry:
 define i64 @atomic_shl1_and_64_gpr_brnz(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_and_64_gpr_brnz:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movl $1, %edx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq $-2, %rsi
-; CHECK-NEXT:    rolq %cl, %rsi
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB40_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %r8
-; CHECK-NEXT:    andq %rsi, %r8
-; CHECK-NEXT:    lock cmpxchgq %r8, (%rdi)
-; CHECK-NEXT:    jne .LBB40_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    testq %rdx, %rax
-; CHECK-NEXT:    je .LBB40_3
-; CHECK-NEXT:  # %bb.4: # %if.then
-; CHECK-NEXT:    movq (%rdi,%rcx,8), %rax
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    andl $63, %eax
+; CHECK-NEXT:    lock btrq %rax, (%rdi)
+; CHECK-NEXT:    jae .LBB40_1
+; CHECK-NEXT:  # %bb.2: # %if.then
+; CHECK-NEXT:    movq (%rdi,%rsi,8), %rax
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB40_3:
+; CHECK-NEXT:  .LBB40_1:
 ; CHECK-NEXT:    movl $123, %eax
 ; CHECK-NEXT:    retq
 entry:
@@ -1316,27 +1218,13 @@ return:                                           ; preds = %entry, %if.then
 define i64 @atomic_shl1_small_mask_and_64_gpr_brnz(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_small_mask_and_64_gpr_brnz:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    andl $31, %ecx
-; CHECK-NEXT:    movl $1, %edx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq $-2, %rsi
-; CHECK-NEXT:    rolq %cl, %rsi
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB43_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %r8
-; CHECK-NEXT:    andq %rsi, %r8
-; CHECK-NEXT:    lock cmpxchgq %r8, (%rdi)
-; CHECK-NEXT:    jne .LBB43_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    testl %edx, %eax
-; CHECK-NEXT:    je .LBB43_3
-; CHECK-NEXT:  # %bb.4: # %if.then
-; CHECK-NEXT:    movq (%rdi,%rcx,8), %rax
+; CHECK-NEXT:    andl $31, %esi
+; CHECK-NEXT:    lock btrq %rsi, (%rdi)
+; CHECK-NEXT:    jae .LBB43_1
+; CHECK-NEXT:  # %bb.2: # %if.then
+; CHECK-NEXT:    movq (%rdi,%rsi,8), %rax
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB43_3:
+; CHECK-NEXT:  .LBB43_1:
 ; CHECK-NEXT:    movl $123, %eax
 ; CHECK-NEXT:    retq
 entry:
@@ -1361,24 +1249,14 @@ return:                                           ; preds = %entry, %if.then
 define i64 @atomic_shl1_mask0_and_64_gpr_brnz(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_mask0_and_64_gpr_brnz:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movq $-2, %rdx
-; CHECK-NEXT:    rolq %cl, %rdx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB44_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rsi
-; CHECK-NEXT:    andq %rdx, %rsi
-; CHECK-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; CHECK-NEXT:    jne .LBB44_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    btq %rcx, %rax
-; CHECK-NEXT:    jae .LBB44_3
-; CHECK-NEXT:  # %bb.4: # %if.then
-; CHECK-NEXT:    movq (%rdi,%rcx,8), %rax
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    andl $63, %eax
+; CHECK-NEXT:    lock btrq %rax, (%rdi)
+; CHECK-NEXT:    jae .LBB44_1
+; CHECK-NEXT:  # %bb.2: # %if.then
+; CHECK-NEXT:    movq (%rdi,%rsi,8), %rax
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB44_3:
+; CHECK-NEXT:  .LBB44_1:
 ; CHECK-NEXT:    movl $123, %eax
 ; CHECK-NEXT:    retq
 entry:
@@ -1404,24 +1282,14 @@ return:                                           ; preds = %entry, %if.then
 define i64 @atomic_shl1_mask1_and_64_gpr_brnz(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_mask1_and_64_gpr_brnz:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movq $-2, %rdx
-; CHECK-NEXT:    rolq %cl, %rdx
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB45_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %rsi
-; CHECK-NEXT:    andq %rdx, %rsi
-; CHECK-NEXT:    lock cmpxchgq %rsi, (%rdi)
-; CHECK-NEXT:    jne .LBB45_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    btq %rcx, %rax
-; CHECK-NEXT:    jae .LBB45_3
-; CHECK-NEXT:  # %bb.4: # %if.then
-; CHECK-NEXT:    movq (%rdi,%rcx,8), %rax
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    andl $63, %eax
+; CHECK-NEXT:    lock btrq %rax, (%rdi)
+; CHECK-NEXT:    jae .LBB45_1
+; CHECK-NEXT:  # %bb.2: # %if.then
+; CHECK-NEXT:    movq (%rdi,%rsi,8), %rax
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB45_3:
+; CHECK-NEXT:  .LBB45_1:
 ; CHECK-NEXT:    movl $123, %eax
 ; CHECK-NEXT:    retq
 entry:
@@ -1447,26 +1315,14 @@ return:                                           ; preds = %entry, %if.then
 define i64 @atomic_shl1_mask01_and_64_gpr_brnz(ptr %v, i64 %c) nounwind {
 ; CHECK-LABEL: atomic_shl1_mask01_and_64_gpr_brnz:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movl $1, %edx
-; CHECK-NEXT:    shlq %cl, %rdx
-; CHECK-NEXT:    movq $-2, %rsi
-; CHECK-NEXT:    rolq %cl, %rsi
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB46_1: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rax, %r8
-; CHECK-NEXT:    andq %rsi, %r8
-; CHECK-NEXT:    lock cmpxchgq %r8, (%rdi)
-; CHECK-NEXT:    jne .LBB46_1
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    testq %rdx, %rax
-; CHECK-NEXT:    je .LBB46_3
-; CHECK-NEXT:  # %bb.4: # %if.then
-; CHECK-NEXT:    movq (%rdi,%rcx,8), %rax
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    andl $63, %eax
+; CHECK-NEXT:    lock btrq %rax, (%rdi)
+; CHECK-NEXT:    jae .LBB46_1
+; CHECK-NEXT:  # %bb.2: # %if.then
+; CHECK-NEXT:    movq (%rdi,%rsi,8), %rax
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB46_3:
+; CHECK-NEXT:  .LBB46_1:
 ; CHECK-NEXT:    movl $123, %eax
 ; CHECK-NEXT:    retq
 entry:

diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
index 4770fc1bb449c..b0f4e08f292e4 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
@@ -751,49 +751,26 @@ entry:
 define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_xor_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $7, %cl
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movzwl (%edx), %eax
-; X86-NEXT:    movzwl %si, %ecx
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB13_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %si, (%edx)
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB13_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $7, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btcw %cx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_xor_16_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andb $7, %cl
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $7, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcw %cx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl %dx, %ecx
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB13_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    xorl %ecx, %edx
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %dx, (%rdi)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB13_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %ecx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -936,47 +913,26 @@ entry:
 define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_xor_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movzwl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB16_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB16_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btcw %cx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andb $15, %cl
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $15, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btcw %cx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB16_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    xorl %edx, %ecx
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, (%rdi)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB16_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -2384,56 +2340,27 @@ entry:
 define zeroext i16 @atomic_shl1_small_mask_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_and_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $7, %cl
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movw $-2, %di
-; X86-NEXT:    rolw %cl, %di
-; X86-NEXT:    movzwl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB37_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl %edi, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $7, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btrw %cx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%edx)
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB37_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzwl %si, %ecx
-; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_and_16_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andb $7, %cl
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movw $-2, %si
+; X64-NEXT:    andl $7, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrw %cx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    rolw %cl, %si
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB37_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl %esi, %ecx
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, (%rdi)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB37_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzwl %dx, %ecx
-; X64-NEXT:    andl %eax, %ecx
-; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    retq
 entry:
   %0 = and i16 %c, 7
@@ -2575,55 +2502,26 @@ entry:
 define zeroext i16 @atomic_shl1_mask01_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_and_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movw $-2, %di
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    rolw %cl, %di
-; X86-NEXT:    movzwl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB40_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl %edi, %ecx
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    jne .LBB40_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btrw %cx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_and_16_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andb $15, %cl
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movw $-2, %r8w
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rolw %cl, %r8w
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB40_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl %r8d, %ecx
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    lock cmpxchgw %cx, (%rdi)
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
-; X64-NEXT:    jne .LBB40_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    andl $15, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btrw %cx, (%rdi)
+; X64-NEXT:    setb %al
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
@@ -3855,40 +3753,25 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB60_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    lock cmpxchgl %ecx, (%esi)
-; X86-NEXT:    jne .LBB60_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl %ecx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_or_32_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl %ecx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB60_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl %edx, %ecx
-; X64-NEXT:    lock cmpxchgl %ecx, (%rdi)
-; X64-NEXT:    jne .LBB60_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i32 1, %c
@@ -3900,42 +3783,25 @@ entry:
 define i32 @atomic_shl1_small_mask_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB61_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    lock cmpxchgl %ecx, (%edx)
-; X86-NEXT:    jne .LBB61_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl %ecx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andb $15, %cl
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $15, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl %ecx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB61_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl %edx, %ecx
-; X64-NEXT:    lock cmpxchgl %ecx, (%rdi)
-; X64-NEXT:    jne .LBB61_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
 entry:
   %0 = and i32 %c, 15
@@ -3948,47 +3814,25 @@ entry:
 define i32 @atomic_shl1_mask0_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB62_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB62_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl %ecx, (%edx)
+; X86-NEXT:    setb %al
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB62_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB62_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl %ecx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
 entry:
   %0 = and i32 %c, 31
@@ -4002,47 +3846,25 @@ entry:
 define i32 @atomic_shl1_mask1_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB63_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB63_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl %ecx, (%edx)
+; X86-NEXT:    setb %al
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB63_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB63_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl %ecx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i32 1, %c
@@ -4056,40 +3878,25 @@ entry:
 define i32 @atomic_shl1_mask01_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB64_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    lock cmpxchgl %ecx, (%esi)
-; X86-NEXT:    jne .LBB64_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    lock btsl %ecx, (%edx)
+; X86-NEXT:    setb %al
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_val:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock btsl %ecx, (%rdi)
+; X64-NEXT:    setb %al
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB64_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    orl %edx, %ecx
-; X64-NEXT:    lock cmpxchgl %ecx, (%rdi)
-; X64-NEXT:    jne .LBB64_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
 entry:
   %0 = and i32 %c, 31
@@ -4807,54 +4614,30 @@ entry:
 define i32 @atomic_shl1_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB78_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB78_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB78_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB78_5
-; X86-NEXT:  .LBB78_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB78_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB78_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB78_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_or_32_gpr_br:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB78_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB78_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB78_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB78_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB78_3:
+; X64-NEXT:  .LBB78_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -4878,56 +4661,28 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_small_mask_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB79_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB79_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB79_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB79_5
-; X86-NEXT:  .LBB79_3:
+; X86-NEXT:    lock btsl %ecx, (%eax)
+; X86-NEXT:    jae .LBB79_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%eax,%ecx,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB79_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB79_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_br:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andl $15, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB79_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB79_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB79_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    andl $15, %esi
+; X64-NEXT:    lock btsl %esi, (%rdi)
+; X64-NEXT:    jae .LBB79_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB79_3:
+; X64-NEXT:  .LBB79_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -4952,54 +4707,30 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask0_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB80_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB80_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jae .LBB80_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB80_5
-; X86-NEXT:  .LBB80_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB80_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB80_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB80_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_br:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB80_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB80_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB80_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB80_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB80_3:
+; X64-NEXT:  .LBB80_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5025,54 +4756,30 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask1_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB81_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB81_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jae .LBB81_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB81_5
-; X86-NEXT:  .LBB81_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB81_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB81_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB81_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_br:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB81_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB81_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB81_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB81_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB81_3:
+; X64-NEXT:  .LBB81_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5098,54 +4805,30 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask01_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB82_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB82_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB82_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB82_5
-; X86-NEXT:  .LBB82_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB82_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB82_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB82_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_br:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB82_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB82_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB82_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB82_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB82_3:
+; X64-NEXT:  .LBB82_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5243,56 +4926,31 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $1, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB84_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    lock cmpxchgl %edx, (%esi)
-; X86-NEXT:    jne .LBB84_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $123, %edx
-; X86-NEXT:    testl %edi, %eax
-; X86-NEXT:    jne .LBB84_4
-; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movl (%esi,%ecx,4), %edx
-; X86-NEXT:  .LBB84_4: # %return
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $31, %eax
+; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    jae .LBB84_1
+; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB84_1: # %if.then
+; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_or_32_gpr_brz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB84_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    orl %esi, %edx
-; X64-NEXT:    lock cmpxchgl %edx, (%rdi)
-; X64-NEXT:    jne .LBB84_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $123, %edx
-; X64-NEXT:    testl %esi, %eax
-; X64-NEXT:    je .LBB84_3
-; X64-NEXT:  # %bb.4: # %return
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    movl $123, %eax
+; X64-NEXT:    jae .LBB84_1
+; X64-NEXT:  # %bb.2: # %return
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB84_3: # %if.then
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %edx
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB84_1: # %if.then
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i32 1, %c
@@ -5315,58 +4973,29 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_small_mask_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl $1, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB85_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    lock cmpxchgl %edx, (%esi)
-; X86-NEXT:    jne .LBB85_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $123, %edx
-; X86-NEXT:    testl %edi, %eax
-; X86-NEXT:    jne .LBB85_4
-; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movl (%esi,%ecx,4), %edx
-; X86-NEXT:  .LBB85_4: # %return
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl $15, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    jae .LBB85_1
+; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB85_1: # %if.then
+; X86-NEXT:    movl (%ecx,%edx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_brz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andl $15, %ecx
-; X64-NEXT:    movl $1, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB85_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    orl %esi, %edx
-; X64-NEXT:    lock cmpxchgl %edx, (%rdi)
-; X64-NEXT:    jne .LBB85_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $123, %edx
-; X64-NEXT:    testl %esi, %eax
-; X64-NEXT:    je .LBB85_3
-; X64-NEXT:  # %bb.4: # %return
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    andl $15, %esi
+; X64-NEXT:    lock btsl %esi, (%rdi)
+; X64-NEXT:    movl $123, %eax
+; X64-NEXT:    jae .LBB85_1
+; X64-NEXT:  # %bb.2: # %return
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB85_3: # %if.then
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %edx
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB85_1: # %if.then
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
 entry:
   %0 = and i32 %c, 15
@@ -5390,56 +5019,31 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask0_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB86_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%esi)
-; X86-NEXT:    jne .LBB86_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $123, %edx
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jb .LBB86_4
-; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movl (%esi,%ecx,4), %edx
-; X86-NEXT:  .LBB86_4: # %return
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $31, %eax
+; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    jae .LBB86_1
+; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB86_1: # %if.then
+; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_brz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB86_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB86_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $123, %edx
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB86_3
-; X64-NEXT:  # %bb.4: # %return
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    movl $123, %eax
+; X64-NEXT:    jae .LBB86_1
+; X64-NEXT:  # %bb.2: # %return
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB86_3: # %if.then
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %edx
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB86_1: # %if.then
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
 entry:
   %rem = and i32 %c, 31
@@ -5464,56 +5068,31 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask1_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB87_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%esi)
-; X86-NEXT:    jne .LBB87_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $123, %edx
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jb .LBB87_4
-; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movl (%esi,%ecx,4), %edx
-; X86-NEXT:  .LBB87_4: # %return
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $31, %eax
+; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    jae .LBB87_1
+; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB87_1: # %if.then
+; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_brz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB87_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB87_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $123, %edx
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB87_3
-; X64-NEXT:  # %bb.4: # %return
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    movl $123, %eax
+; X64-NEXT:    jae .LBB87_1
+; X64-NEXT:  # %bb.2: # %return
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB87_3: # %if.then
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %edx
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB87_1: # %if.then
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
 entry:
   %shl = shl nuw i32 1, %c
@@ -5538,56 +5117,31 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask01_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $1, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB88_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    lock cmpxchgl %edx, (%esi)
-; X86-NEXT:    jne .LBB88_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movl $123, %edx
-; X86-NEXT:    testl %edi, %eax
-; X86-NEXT:    jne .LBB88_4
-; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movl (%esi,%ecx,4), %edx
-; X86-NEXT:  .LBB88_4: # %return
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $31, %eax
+; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    jae .LBB88_1
+; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB88_1: # %if.then
+; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_brz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB88_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    orl %esi, %edx
-; X64-NEXT:    lock cmpxchgl %edx, (%rdi)
-; X64-NEXT:    jne .LBB88_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movl $123, %edx
-; X64-NEXT:    testl %esi, %eax
-; X64-NEXT:    je .LBB88_3
-; X64-NEXT:  # %bb.4: # %return
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    movl $123, %eax
+; X64-NEXT:    jae .LBB88_1
+; X64-NEXT:  # %bb.2: # %return
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB88_3: # %if.then
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %edx
-; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB88_1: # %if.then
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
 entry:
   %rem = and i32 %c, 31
@@ -5685,54 +5239,30 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB90_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB90_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB90_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB90_5
-; X86-NEXT:  .LBB90_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB90_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB90_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB90_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_or_32_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB90_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB90_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB90_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB90_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB90_3:
+; X64-NEXT:  .LBB90_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5756,56 +5286,28 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_small_mask_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB91_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB91_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB91_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB91_5
-; X86-NEXT:  .LBB91_3:
+; X86-NEXT:    lock btsl %ecx, (%eax)
+; X86-NEXT:    jae .LBB91_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%eax,%ecx,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB91_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB91_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    andl $15, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB91_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB91_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB91_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    andl $15, %esi
+; X64-NEXT:    lock btsl %esi, (%rdi)
+; X64-NEXT:    jae .LBB91_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB91_3:
+; X64-NEXT:  .LBB91_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5830,54 +5332,30 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask0_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB92_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB92_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jae .LBB92_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB92_5
-; X86-NEXT:  .LBB92_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB92_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB92_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB92_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB92_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB92_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB92_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB92_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB92_3:
+; X64-NEXT:  .LBB92_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5903,54 +5381,30 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask1_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB93_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB93_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    btl %ecx, %eax
-; X86-NEXT:    jae .LBB93_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB93_5
-; X86-NEXT:  .LBB93_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB93_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB93_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB93_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB93_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB93_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    btl %ecx, %eax
-; X64-NEXT:    jae .LBB93_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB93_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB93_3:
+; X64-NEXT:  .LBB93_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry:
@@ -5976,54 +5430,30 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask01_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB94_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
-; X86-NEXT:    jne .LBB94_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
-; X86-NEXT:    je .LBB94_3
-; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
-; X86-NEXT:    jmp .LBB94_5
-; X86-NEXT:  .LBB94_3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    jae .LBB94_1
+; X86-NEXT:  # %bb.2: # %if.then
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB94_1:
 ; X86-NEXT:    movl $123, %eax
-; X86-NEXT:  .LBB94_5: # %return
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_brnz:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB94_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    orl %edx, %esi
-; X64-NEXT:    lock cmpxchgl %esi, (%rdi)
-; X64-NEXT:    jne .LBB94_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-NEXT:    testl %edx, %eax
-; X64-NEXT:    je .LBB94_3
-; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %eax
+; X64-NEXT:    lock btsl %eax, (%rdi)
+; X64-NEXT:    jae .LBB94_1
+; X64-NEXT:  # %bb.2: # %if.then
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl (%rdi,%rax,4), %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB94_3:
+; X64-NEXT:  .LBB94_1:
 ; X64-NEXT:    movl $123, %eax
 ; X64-NEXT:    retq
 entry: