[llvm-commits] [llvm] r164281 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp lib/Target/X86/X86ISelLowering.h lib/Target/X86/X86InstrCompiler.td lib/Target/X86/X86InstrInfo.cpp lib/Target/X86/X86InstrInfo.h test/CodeGen/X86/2010-01-08-Atomic64Bug.ll test/CodeGen/X86/atomic16.ll test/CodeGen/X86/atomic32.ll test/CodeGen/X86/atomic64.ll test/CodeGen/X86/atomic6432.ll test/CodeGen/X86/atomic8.ll test/CodeGen/X86/atomic_op.ll test/CodeGen/X86/pr13458.ll

Cameron Zwarich zwarich at apple.com
Sun Feb 24 15:24:03 PST 2013


I guess it is a bit late to say this now, but this commit has a lot of problems. It sticks store memory operands onto loads, copies kill flags from one use to multiple uses, and uses a physical register across basic blocks prior to register allocation. I have a patch ready for the first two, and I'll probably fix the last one and commit it.
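
Roughly, a fix for the first two issues could look something like the sketch below (illustrative only, not the actual patch; it reuses the MI, MBB, thisMBB, DL, TII, LOADOpc, AccPhyReg and MemOpndSlot variables from EmitAtomicLoadArith in the change quoted below, and assumes the usual MachineMemOperand/MachineInstrBuilder APIs): derive a load-only memoperand for the initial load instead of copying the pseudo's combined load+store memoperand, and drop the kill flags on address operands that are now used by both the load and the cmpxchg.

  // Illustrative sketch only.
  MachineFunction *MF = MBB->getParent();

  // The pseudo's single memoperand describes a load *and* a store; the
  // initial load should not carry the store half, so build a load-only
  // memoperand from it instead of copying it wholesale.
  MachineMemOperand *OldMMO = *MI->memoperands_begin();
  MachineMemOperand *LoadMMO =
    MF->getMachineMemOperand(OldMMO->getPointerInfo(),
                             MachineMemOperand::MOLoad,
                             OldMMO->getSize(),
                             OldMMO->getBaseAlignment());

  MachineInstrBuilder MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg);
  for (unsigned i = 0; i != X86::AddrNumOperands; ++i) {
    MachineOperand AddrOp = MI->getOperand(MemOpndSlot + i);
    // The same address operands feed the LCMPXCHG later, so a kill flag
    // copied onto this first use would be wrong.
    if (AddrOp.isReg())
      AddrOp.setIsKill(false);
    MIB.addOperand(AddrOp);
  }
  MIB.addMemOperand(LoadMMO);

The third issue (keeping EAX/EDX live across the loop blocks before register allocation) probably wants virtual registers plus PHI nodes instead, but that is a larger change.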

Cameron

On Sep 19, 2012, at 8:06 PM, Michael Liao <michael.liao at intel.com> wrote:

> Author: hliao
> Date: Wed Sep 19 22:06:15 2012
> New Revision: 164281
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=164281&view=rev
> Log:
> Re-work X86 code generation of atomic ops with spin-loop
> 
> - Rewrite/merge pseudo-atomic instruction emitters to address the
>  following issue:
>  * Reduce one unnecessary load in spin-loop
> 
>    previously, the spin-loop looked like:
> 
>        thisMBB:
>        newMBB:
>          ld  t1 = [bitinstr.addr]
>          op  t2 = t1, [bitinstr.val]
>          not t3 = t2  (if Invert)
>          mov EAX = t1
>          lcs dest = [bitinstr.addr], t3  [EAX is implicit]
>          bz  newMBB
>          fallthrough -->nextMBB
> 
>    the 'ld' at the beginning of newMBB should be lifted out of the loop
>    as lcs (or CMPXCHG on x86) will load the current memory value into
>    EAX. This loop is refined as:
> 
>        thisMBB:
>          EAX = LOAD [MI.addr]
>        mainMBB:
>          t1 = OP [MI.val], EAX
>          LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
>          JNE mainMBB
>        sinkMBB:
> 
>  * Remove immopc as, so far, all pseudo-atomic instructions have the
>    all-register form only; there is no immediate operand.
> 
>  * Remove unnecessary attributes/modifiers in pseudo-atomic instruction
>    td
> 
>  * Fix issues in PR13458
> 
> - Add comprehensive tests on atomic ops on various data types.
>  NOTE: Some of them are turned off due to missing functionality.
> 
> - Revise existing tests due to the new spin-loop being generated.
> 
> 
> Added:
>    llvm/trunk/test/CodeGen/X86/atomic16.ll
>    llvm/trunk/test/CodeGen/X86/atomic32.ll
>    llvm/trunk/test/CodeGen/X86/atomic64.ll
>    llvm/trunk/test/CodeGen/X86/atomic6432.ll
>    llvm/trunk/test/CodeGen/X86/atomic8.ll
>    llvm/trunk/test/CodeGen/X86/pr13458.ll
> Modified:
>    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>    llvm/trunk/lib/Target/X86/X86ISelLowering.h
>    llvm/trunk/lib/Target/X86/X86InstrCompiler.td
>    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
>    llvm/trunk/lib/Target/X86/X86InstrInfo.h
>    llvm/trunk/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
>    llvm/trunk/test/CodeGen/X86/atomic_op.ll
> 
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=164281&r1=164280&r2=164281&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Sep 19 22:06:15 2012
> @@ -11911,385 +11911,498 @@
> //===----------------------------------------------------------------------===//
> 
> // private utility function
> +
> +// Get CMPXCHG opcode for the specified data type.
> +static unsigned getCmpXChgOpcode(EVT VT) {
> +  switch (VT.getSimpleVT().SimpleTy) {
> +  case MVT::i8:  return X86::LCMPXCHG8;
> +  case MVT::i16: return X86::LCMPXCHG16;
> +  case MVT::i32: return X86::LCMPXCHG32;
> +  case MVT::i64: return X86::LCMPXCHG64;
> +  default:
> +    break;
> +  }
> +  llvm_unreachable("Invalid operand size!");
> +}
> +
> +// Get LOAD opcode for the specified data type.
> +static unsigned getLoadOpcode(EVT VT) {
> +  switch (VT.getSimpleVT().SimpleTy) {
> +  case MVT::i8:  return X86::MOV8rm;
> +  case MVT::i16: return X86::MOV16rm;
> +  case MVT::i32: return X86::MOV32rm;
> +  case MVT::i64: return X86::MOV64rm;
> +  default:
> +    break;
> +  }
> +  llvm_unreachable("Invalid operand size!");
> +}
> +
> +// Get opcode of the non-atomic one from the specified atomic instruction.
> +static unsigned getNonAtomicOpcode(unsigned Opc) {
> +  switch (Opc) {
> +  case X86::ATOMAND8:  return X86::AND8rr;
> +  case X86::ATOMAND16: return X86::AND16rr;
> +  case X86::ATOMAND32: return X86::AND32rr;
> +  case X86::ATOMAND64: return X86::AND64rr;
> +  case X86::ATOMOR8:   return X86::OR8rr;
> +  case X86::ATOMOR16:  return X86::OR16rr;
> +  case X86::ATOMOR32:  return X86::OR32rr;
> +  case X86::ATOMOR64:  return X86::OR64rr;
> +  case X86::ATOMXOR8:  return X86::XOR8rr;
> +  case X86::ATOMXOR16: return X86::XOR16rr;
> +  case X86::ATOMXOR32: return X86::XOR32rr;
> +  case X86::ATOMXOR64: return X86::XOR64rr;
> +  }
> +  llvm_unreachable("Unhandled atomic-load-op opcode!");
> +}
> +
> +// Get opcode of the non-atomic one from the specified atomic instruction with
> +// extra opcode.
> +static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
> +                                               unsigned &ExtraOpc) {
> +  switch (Opc) {
> +  case X86::ATOMNAND8:  ExtraOpc = X86::NOT8r;   return X86::AND8rr;
> +  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r;  return X86::AND16rr;
> +  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r;  return X86::AND32rr;
> +  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r;  return X86::AND64rr;
> +  case X86::ATOMMAX16:  ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
> +  case X86::ATOMMAX32:  ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
> +  case X86::ATOMMAX64:  ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
> +  case X86::ATOMMIN16:  ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
> +  case X86::ATOMMIN32:  ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
> +  case X86::ATOMMIN64:  ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
> +  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
> +  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
> +  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
> +  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
> +  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
> +  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
> +  }
> +  llvm_unreachable("Unhandled atomic-load-op opcode!");
> +}
> +
> +// Get opcode of the non-atomic one from the specified atomic instruction for
> +// 64-bit data type on 32-bit target.
> +static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
> +  switch (Opc) {
> +  case X86::ATOMAND6432:  HiOpc = X86::AND32rr; return X86::AND32rr;
> +  case X86::ATOMOR6432:   HiOpc = X86::OR32rr;  return X86::OR32rr;
> +  case X86::ATOMXOR6432:  HiOpc = X86::XOR32rr; return X86::XOR32rr;
> +  case X86::ATOMADD6432:  HiOpc = X86::ADC32rr; return X86::ADD32rr;
> +  case X86::ATOMSUB6432:  HiOpc = X86::SBB32rr; return X86::SUB32rr;
> +  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
> +  }
> +  llvm_unreachable("Unhandled atomic-load-op opcode!");
> +}
> +
> +// Get opcode of the non-atomic one from the specified atomic instruction for
> +// 64-bit data type on 32-bit target with extra opcode.
> +static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
> +                                                   unsigned &HiOpc,
> +                                                   unsigned &ExtraOpc) {
> +  switch (Opc) {
> +  case X86::ATOMNAND6432:
> +    ExtraOpc = X86::NOT32r;
> +    HiOpc = X86::AND32rr;
> +    return X86::AND32rr;
> +  }
> +  llvm_unreachable("Unhandled atomic-load-op opcode!");
> +}
> +
> +// Get pseudo CMOV opcode from the specified data type.
> +static unsigned getPseudoCMOVOpc(EVT VT) {
> +  switch (VT.getSimpleVT().SimpleTy) {
> +  case MVT::i16: return X86::CMOV_GR16;
> +  case MVT::i32: return X86::CMOV_GR32;
> +  default:
> +    break;
> +  }
> +  llvm_unreachable("Unknown CMOV opcode!");
> +}
> +
> +// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
> +// They will be translated into a spin-loop or compare-exchange loop from
> +//
> +//    ...
> +//    dst = atomic-fetch-op MI.addr, MI.val
> +//    ...
> +//
> +// to
> +//
> +//    ...
> +//    EAX = LOAD MI.addr
> +// loop:
> +//    t1 = OP MI.val, EAX
> +//    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
> +//    JNE loop
> +// sink:
> +//    dst = EAX
> +//    ...
> MachineBasicBlock *
> -X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
> -                                                       MachineBasicBlock *MBB,
> -                                                       unsigned regOpc,
> -                                                       unsigned immOpc,
> -                                                       unsigned LoadOpc,
> -                                                       unsigned CXchgOpc,
> -                                                       unsigned notOpc,
> -                                                       unsigned EAXreg,
> -                                                 const TargetRegisterClass *RC,
> -                                                       bool Invert) const {
> -  // For the atomic bitwise operator, we generate
> -  //   thisMBB:
> -  //   newMBB:
> -  //     ld  t1 = [bitinstr.addr]
> -  //     op  t2 = t1, [bitinstr.val]
> -  //     not t3 = t2  (if Invert)
> -  //     mov EAX = t1
> -  //     lcs dest = [bitinstr.addr], t3  [EAX is implicit]
> -  //     bz  newMBB
> -  //     fallthrough -->nextMBB
> +X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
> +                                       MachineBasicBlock *MBB) const {
>   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
> -  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
> -  MachineFunction::iterator MBBIter = MBB;
> -  ++MBBIter;
> +  DebugLoc DL = MI->getDebugLoc();
> 
> -  /// First build the CFG
> -  MachineFunction *F = MBB->getParent();
> -  MachineBasicBlock *thisMBB = MBB;
> -  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
> -  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
> -  F->insert(MBBIter, newMBB);
> -  F->insert(MBBIter, nextMBB);
> -
> -  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
> -  nextMBB->splice(nextMBB->begin(), thisMBB,
> -                  llvm::next(MachineBasicBlock::iterator(bInstr)),
> -                  thisMBB->end());
> -  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
> -
> -  // Update thisMBB to fall through to newMBB
> -  thisMBB->addSuccessor(newMBB);
> -
> -  // newMBB jumps to itself and fall through to nextMBB
> -  newMBB->addSuccessor(nextMBB);
> -  newMBB->addSuccessor(newMBB);
> -
> -  // Insert instructions into newMBB based on incoming instruction
> -  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
> -         "unexpected number of operands");
> -  DebugLoc dl = bInstr->getDebugLoc();
> -  MachineOperand& destOper = bInstr->getOperand(0);
> -  MachineOperand* argOpers[2 + X86::AddrNumOperands];
> -  int numArgs = bInstr->getNumOperands() - 1;
> -  for (int i=0; i < numArgs; ++i)
> -    argOpers[i] = &bInstr->getOperand(i+1);
> -
> -  // x86 address has 4 operands: base, index, scale, and displacement
> -  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
> -  int valArgIndx = lastAddrIndx + 1;
> -
> -  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
> -  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
> -  for (int i=0; i <= lastAddrIndx; ++i)
> -    (*MIB).addOperand(*argOpers[i]);
> -
> -  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
> -  assert((argOpers[valArgIndx]->isReg() ||
> -          argOpers[valArgIndx]->isImm()) &&
> -         "invalid operand");
> -  if (argOpers[valArgIndx]->isReg())
> -    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
> -  else
> -    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
> -  MIB.addReg(t1);
> -  (*MIB).addOperand(*argOpers[valArgIndx]);
> +  MachineFunction *MF = MBB->getParent();
> +  MachineRegisterInfo &MRI = MF->getRegInfo();
> 
> -  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
> -  if (Invert) {
> -    MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2);
> -  }
> -  else
> -    t3 = t2;
> +  const BasicBlock *BB = MBB->getBasicBlock();
> +  MachineFunction::iterator I = MBB;
> +  ++I;
> 
> -  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
> -  MIB.addReg(t1);
> +  assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 &&
> +         "Unexpected number of operands");
> +
> +  assert(MI->hasOneMemOperand() &&
> +         "Expected atomic-load-op to have one memoperand");
> +
> +  // Memory Reference
> +  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
> +  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
> 
> -  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
> -  for (int i=0; i <= lastAddrIndx; ++i)
> -    (*MIB).addOperand(*argOpers[i]);
> -  MIB.addReg(t3);
> -  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
> -  (*MIB).setMemRefs(bInstr->memoperands_begin(),
> -                    bInstr->memoperands_end());
> +  unsigned DstReg, SrcReg;
> +  unsigned MemOpndSlot;
> 
> -  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
> -  MIB.addReg(EAXreg);
> +  unsigned CurOp = 0;
> 
> -  // insert branch
> -  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
> +  DstReg = MI->getOperand(CurOp++).getReg();
> +  MemOpndSlot = CurOp;
> +  CurOp += X86::AddrNumOperands;
> +  SrcReg = MI->getOperand(CurOp++).getReg();
> 
> -  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
> -  return nextMBB;
> -}
> +  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
> +  EVT VT = *RC->vt_begin();
> +  unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT);
> 
> -// private utility function:  64 bit atomics on 32 bit host.
> -MachineBasicBlock *
> -X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
> -                                                       MachineBasicBlock *MBB,
> -                                                       unsigned regOpcL,
> -                                                       unsigned regOpcH,
> -                                                       unsigned immOpcL,
> -                                                       unsigned immOpcH,
> -                                                       bool Invert) const {
> -  // For the atomic bitwise operator, we generate
> -  //   thisMBB (instructions are in pairs, except cmpxchg8b)
> -  //     ld t1,t2 = [bitinstr.addr]
> -  //   newMBB:
> -  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
> -  //     op  t5, t6 <- out1, out2, [bitinstr.val]
> -  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
> -  //     neg t7, t8 < t5, t6  (if Invert)
> -  //     mov ECX, EBX <- t5, t6
> -  //     mov EAX, EDX <- t1, t2
> -  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
> -  //     mov t3, t4 <- EAX, EDX
> -  //     bz  newMBB
> -  //     result in out1, out2
> -  //     fallthrough -->nextMBB
> +  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
> +  unsigned LOADOpc = getLoadOpcode(VT);
> 
> -  const TargetRegisterClass *RC = &X86::GR32RegClass;
> -  const unsigned LoadOpc = X86::MOV32rm;
> -  const unsigned NotOpc = X86::NOT32r;
> -  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
> -  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
> -  MachineFunction::iterator MBBIter = MBB;
> -  ++MBBIter;
> +  // For the atomic load-arith operator, we generate
> +  //
> +  //  thisMBB:
> +  //    EAX = LOAD [MI.addr]
> +  //  mainMBB:
> +  //    t1 = OP MI.val, EAX
> +  //    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
> +  //    JNE mainMBB
> +  //  sinkMBB:
> 
> -  /// First build the CFG
> -  MachineFunction *F = MBB->getParent();
>   MachineBasicBlock *thisMBB = MBB;
> -  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
> -  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
> -  F->insert(MBBIter, newMBB);
> -  F->insert(MBBIter, nextMBB);
> -
> -  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
> -  nextMBB->splice(nextMBB->begin(), thisMBB,
> -                  llvm::next(MachineBasicBlock::iterator(bInstr)),
> -                  thisMBB->end());
> -  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
> -
> -  // Update thisMBB to fall through to newMBB
> -  thisMBB->addSuccessor(newMBB);
> -
> -  // newMBB jumps to itself and fall through to nextMBB
> -  newMBB->addSuccessor(nextMBB);
> -  newMBB->addSuccessor(newMBB);
> -
> -  DebugLoc dl = bInstr->getDebugLoc();
> -  // Insert instructions into newMBB based on incoming instruction
> -  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
> -  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
> -         "unexpected number of operands");
> -  MachineOperand& dest1Oper = bInstr->getOperand(0);
> -  MachineOperand& dest2Oper = bInstr->getOperand(1);
> -  MachineOperand* argOpers[2 + X86::AddrNumOperands];
> -  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
> -    argOpers[i] = &bInstr->getOperand(i+2);
> -
> -    // We use some of the operands multiple times, so conservatively just
> -    // clear any kill flags that might be present.
> -    if (argOpers[i]->isReg() && argOpers[i]->isUse())
> -      argOpers[i]->setIsKill(false);
> -  }
> -
> -  // x86 address has 5 operands: base, index, scale, displacement, and segment.
> -  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
> -
> -  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
> -  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
> -  for (int i=0; i <= lastAddrIndx; ++i)
> -    (*MIB).addOperand(*argOpers[i]);
> -  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
> -  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
> -  // add 4 to displacement.
> -  for (int i=0; i <= lastAddrIndx-2; ++i)
> -    (*MIB).addOperand(*argOpers[i]);
> -  MachineOperand newOp3 = *(argOpers[3]);
> -  if (newOp3.isImm())
> -    newOp3.setImm(newOp3.getImm()+4);
> -  else
> -    newOp3.setOffset(newOp3.getOffset()+4);
> -  (*MIB).addOperand(newOp3);
> -  (*MIB).addOperand(*argOpers[lastAddrIndx]);
> -
> -  // t3/4 are defined later, at the bottom of the loop
> -  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
> -  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
> -  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
> -    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
> -  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
> -    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
> -
> -  // The subsequent operations should be using the destination registers of
> -  // the PHI instructions.
> -  t1 = dest1Oper.getReg();
> -  t2 = dest2Oper.getReg();
> -
> -  int valArgIndx = lastAddrIndx + 1;
> -  assert((argOpers[valArgIndx]->isReg() ||
> -          argOpers[valArgIndx]->isImm()) &&
> -         "invalid operand");
> -  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
> -  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
> -  if (argOpers[valArgIndx]->isReg())
> -    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
> -  else
> -    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
> -  if (regOpcL != X86::MOV32rr)
> -    MIB.addReg(t1);
> -  (*MIB).addOperand(*argOpers[valArgIndx]);
> -  assert(argOpers[valArgIndx + 1]->isReg() ==
> -         argOpers[valArgIndx]->isReg());
> -  assert(argOpers[valArgIndx + 1]->isImm() ==
> -         argOpers[valArgIndx]->isImm());
> -  if (argOpers[valArgIndx + 1]->isReg())
> -    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
> -  else
> -    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
> -  if (regOpcH != X86::MOV32rr)
> -    MIB.addReg(t2);
> -  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
> -
> -  unsigned t7, t8;
> -  if (Invert) {
> -    t7 = F->getRegInfo().createVirtualRegister(RC);
> -    t8 = F->getRegInfo().createVirtualRegister(RC);
> -    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5);
> -    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6);
> -  } else {
> -    t7 = t5;
> -    t8 = t6;
> +  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
> +  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
> +  MF->insert(I, mainMBB);
> +  MF->insert(I, sinkMBB);
> +
> +  MachineInstrBuilder MIB;
> +
> +  // Transfer the remainder of BB and its successor edges to sinkMBB.
> +  sinkMBB->splice(sinkMBB->begin(), MBB,
> +                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
> +  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
> +
> +  // thisMBB:
> +  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg);
> +  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
> +    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
> +  MIB.setMemRefs(MMOBegin, MMOEnd);
> +
> +  thisMBB->addSuccessor(mainMBB);
> +
> +  // mainMBB:
> +  MachineBasicBlock *origMainMBB = mainMBB;
> +  mainMBB->addLiveIn(AccPhyReg);
> +
> +  // Copy AccPhyReg as it is used more than once.
> +  unsigned AccReg = MRI.createVirtualRegister(RC);
> +  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg)
> +    .addReg(AccPhyReg);
> +
> +  unsigned t1 = MRI.createVirtualRegister(RC);
> +  unsigned Opc = MI->getOpcode();
> +  switch (Opc) {
> +  default:
> +    llvm_unreachable("Unhandled atomic-load-op opcode!");
> +  case X86::ATOMAND8:
> +  case X86::ATOMAND16:
> +  case X86::ATOMAND32:
> +  case X86::ATOMAND64:
> +  case X86::ATOMOR8:
> +  case X86::ATOMOR16:
> +  case X86::ATOMOR32:
> +  case X86::ATOMOR64:
> +  case X86::ATOMXOR8:
> +  case X86::ATOMXOR16:
> +  case X86::ATOMXOR32:
> +  case X86::ATOMXOR64: {
> +    unsigned ARITHOpc = getNonAtomicOpcode(Opc);
> +    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg)
> +      .addReg(AccReg);
> +    break;
> +  }
> +  case X86::ATOMNAND8:
> +  case X86::ATOMNAND16:
> +  case X86::ATOMNAND32:
> +  case X86::ATOMNAND64: {
> +    unsigned t2 = MRI.createVirtualRegister(RC);
> +    unsigned NOTOpc;
> +    unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
> +    BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg)
> +      .addReg(AccReg);
> +    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2);
> +    break;
> +  }
> +  case X86::ATOMMAX16:
> +  case X86::ATOMMAX32:
> +  case X86::ATOMMAX64:
> +  case X86::ATOMMIN16:
> +  case X86::ATOMMIN32:
> +  case X86::ATOMMIN64:
> +  case X86::ATOMUMAX16:
> +  case X86::ATOMUMAX32:
> +  case X86::ATOMUMAX64:
> +  case X86::ATOMUMIN16:
> +  case X86::ATOMUMIN32:
> +  case X86::ATOMUMIN64: {
> +    unsigned CMPOpc;
> +    unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
> +
> +    BuildMI(mainMBB, DL, TII->get(CMPOpc))
> +      .addReg(SrcReg)
> +      .addReg(AccReg);
> +
> +    if (Subtarget->hasCMov()) {
> +      // Native support
> +      BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1)
> +        .addReg(SrcReg)
> +        .addReg(AccReg);
> +    } else {
> +      // Use pseudo select and lower them.
> +      assert((VT == MVT::i16 || VT == MVT::i32) &&
> +             "Invalid atomic-load-op transformation!");
> +      unsigned SelOpc = getPseudoCMOVOpc(VT);
> +      X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
> +      assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
> +      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1)
> +              .addReg(SrcReg).addReg(AccReg)
> +              .addImm(CC);
> +      mainMBB = EmitLoweredSelect(MIB, mainMBB);
> +    }
> +    break;
> +  }
>   }
> 
> -  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
> +  // Copy AccPhyReg back from virtual register.
> +  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg)
> +    .addReg(AccReg);
> +
> +  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
> +  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
> +    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
>   MIB.addReg(t1);
> -  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
> -  MIB.addReg(t2);
> +  MIB.setMemRefs(MMOBegin, MMOEnd);
> 
> -  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
> -  MIB.addReg(t7);
> -  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
> -  MIB.addReg(t8);
> -
> -  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
> -  for (int i=0; i <= lastAddrIndx; ++i)
> -    (*MIB).addOperand(*argOpers[i]);
> -
> -  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
> -  (*MIB).setMemRefs(bInstr->memoperands_begin(),
> -                    bInstr->memoperands_end());
> -
> -  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
> -  MIB.addReg(X86::EAX);
> -  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
> -  MIB.addReg(X86::EDX);
> +  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
> 
> -  // insert branch
> -  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
> +  mainMBB->addSuccessor(origMainMBB);
> +  mainMBB->addSuccessor(sinkMBB);
> +
> +  // sinkMBB:
> +  sinkMBB->addLiveIn(AccPhyReg);
> +
> +  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
> +          TII->get(TargetOpcode::COPY), DstReg)
> +    .addReg(AccPhyReg);
> 
> -  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
> -  return nextMBB;
> +  MI->eraseFromParent();
> +  return sinkMBB;
> }
> 
> -// private utility function
> +// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
> +// instructions. They will be translated into a spin-loop or compare-exchange
> +// loop from
> +//
> +//    ...
> +//    dst = atomic-fetch-op MI.addr, MI.val
> +//    ...
> +//
> +// to
> +//
> +//    ...
> +//    EAX = LOAD [MI.addr + 0]
> +//    EDX = LOAD [MI.addr + 4]
> +// loop:
> +//    EBX = OP MI.val.lo, EAX
> +//    ECX = OP MI.val.hi, EDX
> +//    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
> +//    JNE loop
> +// sink:
> +//    dst = EDX:EAX
> +//    ...
> MachineBasicBlock *
> -X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
> -                                                      MachineBasicBlock *MBB,
> -                                                      unsigned cmovOpc) const {
> -  // For the atomic min/max operator, we generate
> -  //   thisMBB:
> -  //   newMBB:
> -  //     ld t1 = [min/max.addr]
> -  //     mov t2 = [min/max.val]
> -  //     cmp  t1, t2
> -  //     cmov[cond] t2 = t1
> -  //     mov EAX = t1
> -  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
> -  //     bz   newMBB
> -  //     fallthrough -->nextMBB
> -  //
> +X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
> +                                           MachineBasicBlock *MBB) const {
>   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
> -  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
> -  MachineFunction::iterator MBBIter = MBB;
> -  ++MBBIter;
> +  DebugLoc DL = MI->getDebugLoc();
> +
> +  MachineFunction *MF = MBB->getParent();
> +  MachineRegisterInfo &MRI = MF->getRegInfo();
> +
> +  const BasicBlock *BB = MBB->getBasicBlock();
> +  MachineFunction::iterator I = MBB;
> +  ++I;
> +
> +  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
> +         "Unexpected number of operands");
> +
> +  assert(MI->hasOneMemOperand() &&
> +         "Expected atomic-load-op32 to have one memoperand");
> +
> +  // Memory Reference
> +  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
> +  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
> +
> +  unsigned DstLoReg, DstHiReg;
> +  unsigned SrcLoReg, SrcHiReg;
> +  unsigned MemOpndSlot;
> +
> +  unsigned CurOp = 0;
> +
> +  DstLoReg = MI->getOperand(CurOp++).getReg();
> +  DstHiReg = MI->getOperand(CurOp++).getReg();
> +  MemOpndSlot = CurOp;
> +  CurOp += X86::AddrNumOperands;
> +  SrcLoReg = MI->getOperand(CurOp++).getReg();
> +  SrcHiReg = MI->getOperand(CurOp++).getReg();
> +
> +  const TargetRegisterClass *RC = &X86::GR32RegClass;
> +
> +  unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
> +  unsigned LOADOpc = X86::MOV32rm;
> +
> +  // For the atomic load-arith operator, we generate
> +  //
> +  //  thisMBB:
> +  //    EAX = LOAD [MI.addr + 0]
> +  //    EDX = LOAD [MI.addr + 4]
> +  //  mainMBB:
> +  //    EBX = OP MI.vallo, EAX
> +  //    ECX = OP MI.valhi, EDX
> +  //    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
> +  //    JNE mainMBB
> +  //  sinkMBB:
> 
> -  /// First build the CFG
> -  MachineFunction *F = MBB->getParent();
>   MachineBasicBlock *thisMBB = MBB;
> -  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
> -  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
> -  F->insert(MBBIter, newMBB);
> -  F->insert(MBBIter, nextMBB);
> -
> -  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
> -  nextMBB->splice(nextMBB->begin(), thisMBB,
> -                  llvm::next(MachineBasicBlock::iterator(mInstr)),
> -                  thisMBB->end());
> -  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
> -
> -  // Update thisMBB to fall through to newMBB
> -  thisMBB->addSuccessor(newMBB);
> -
> -  // newMBB jumps to newMBB and fall through to nextMBB
> -  newMBB->addSuccessor(nextMBB);
> -  newMBB->addSuccessor(newMBB);
> -
> -  DebugLoc dl = mInstr->getDebugLoc();
> -  // Insert instructions into newMBB based on incoming instruction
> -  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
> -         "unexpected number of operands");
> -  MachineOperand& destOper = mInstr->getOperand(0);
> -  MachineOperand* argOpers[2 + X86::AddrNumOperands];
> -  int numArgs = mInstr->getNumOperands() - 1;
> -  for (int i=0; i < numArgs; ++i)
> -    argOpers[i] = &mInstr->getOperand(i+1);
> -
> -  // x86 address has 4 operands: base, index, scale, and displacement
> -  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
> -  int valArgIndx = lastAddrIndx + 1;
> -
> -  unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
> -  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
> -  for (int i=0; i <= lastAddrIndx; ++i)
> -    (*MIB).addOperand(*argOpers[i]);
> -
> -  // We only support register and immediate values
> -  assert((argOpers[valArgIndx]->isReg() ||
> -          argOpers[valArgIndx]->isImm()) &&
> -         "invalid operand");
> -
> -  unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
> -  if (argOpers[valArgIndx]->isReg())
> -    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
> -  else
> -    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
> -  (*MIB).addOperand(*argOpers[valArgIndx]);
> +  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
> +  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
> +  MF->insert(I, mainMBB);
> +  MF->insert(I, sinkMBB);
> 
> -  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
> -  MIB.addReg(t1);
> +  MachineInstrBuilder MIB;
> 
> -  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
> -  MIB.addReg(t1);
> -  MIB.addReg(t2);
> +  // Transfer the remainder of BB and its successor edges to sinkMBB.
> +  sinkMBB->splice(sinkMBB->begin(), MBB,
> +                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
> +  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
> +
> +  // thisMBB:
> +  // Lo
> +  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX);
> +  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
> +    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
> +  MIB.setMemRefs(MMOBegin, MMOEnd);
> +  // Hi
> +  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX);
> +  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
> +    if (i == X86::AddrDisp)
> +      MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
> +    else
> +      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
> +  }
> +  MIB.setMemRefs(MMOBegin, MMOEnd);
> 
> -  // Generate movc
> -  unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
> -  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
> -  MIB.addReg(t2);
> -  MIB.addReg(t1);
> +  thisMBB->addSuccessor(mainMBB);
> 
> -  // Cmp and exchange if none has modified the memory location
> -  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
> -  for (int i=0; i <= lastAddrIndx; ++i)
> -    (*MIB).addOperand(*argOpers[i]);
> -  MIB.addReg(t3);
> -  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
> -  (*MIB).setMemRefs(mInstr->memoperands_begin(),
> -                    mInstr->memoperands_end());
> +  // mainMBB:
> +  MachineBasicBlock *origMainMBB = mainMBB;
> +  mainMBB->addLiveIn(X86::EAX);
> +  mainMBB->addLiveIn(X86::EDX);
> 
> -  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
> -  MIB.addReg(X86::EAX);
> +  // Copy EDX:EAX as they are used more than once.
> +  unsigned LoReg = MRI.createVirtualRegister(RC);
> +  unsigned HiReg = MRI.createVirtualRegister(RC);
> +  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX);
> +  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX);
> 
> -  // insert branch
> -  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
> +  unsigned t1L = MRI.createVirtualRegister(RC);
> +  unsigned t1H = MRI.createVirtualRegister(RC);
> 
> -  mInstr->eraseFromParent();   // The pseudo instruction is gone now.
> -  return nextMBB;
> +  unsigned Opc = MI->getOpcode();
> +  switch (Opc) {
> +  default:
> +    llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
> +  case X86::ATOMAND6432:
> +  case X86::ATOMOR6432:
> +  case X86::ATOMXOR6432:
> +  case X86::ATOMADD6432:
> +  case X86::ATOMSUB6432: {
> +    unsigned HiOpc;
> +    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
> +    BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg).addReg(LoReg);
> +    BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg).addReg(HiReg);
> +    break;
> +  }
> +  case X86::ATOMNAND6432: {
> +    unsigned HiOpc, NOTOpc;
> +    unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
> +    unsigned t2L = MRI.createVirtualRegister(RC);
> +    unsigned t2H = MRI.createVirtualRegister(RC);
> +    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg);
> +    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg);
> +    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L);
> +    BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H);
> +    break;
> +  }
> +  case X86::ATOMSWAP6432: {
> +    unsigned HiOpc;
> +    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
> +    BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg);
> +    BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg);
> +    break;
> +  }
> +  }
> +
> +  // Copy EDX:EAX back from HiReg:LoReg
> +  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg);
> +  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg);
> +  // Copy ECX:EBX from t1H:t1L
> +  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L);
> +  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H);
> +
> +  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
> +  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
> +    MIB.addOperand(MI->getOperand(MemOpndSlot + i));
> +  MIB.setMemRefs(MMOBegin, MMOEnd);
> +
> +  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
> +
> +  mainMBB->addSuccessor(origMainMBB);
> +  mainMBB->addSuccessor(sinkMBB);
> +
> +  // sinkMBB:
> +  sinkMBB->addLiveIn(X86::EAX);
> +  sinkMBB->addLiveIn(X86::EDX);
> +
> +  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
> +          TII->get(TargetOpcode::COPY), DstLoReg)
> +    .addReg(X86::EAX);
> +  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
> +          TII->get(TargetOpcode::COPY), DstHiReg)
> +    .addReg(X86::EDX);
> +
> +  MI->eraseFromParent();
> +  return sinkMBB;
> }
> 
> // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
> @@ -13176,130 +13289,42 @@
>     return EmitMonitor(MI, BB);
> 
>     // Atomic Lowering.
> -  case X86::ATOMMIN32:
> -  case X86::ATOMMAX32:
> -  case X86::ATOMUMIN32:
> -  case X86::ATOMUMAX32:
> -  case X86::ATOMMIN16:
> -  case X86::ATOMMAX16:
> -  case X86::ATOMUMIN16:
> -  case X86::ATOMUMAX16:
> -  case X86::ATOMMIN64:
> -  case X86::ATOMMAX64:
> -  case X86::ATOMUMIN64:
> -  case X86::ATOMUMAX64: {
> -    unsigned Opc;
> -    switch (MI->getOpcode()) {
> -    default: llvm_unreachable("illegal opcode!");
> -    case X86::ATOMMIN32:  Opc = X86::CMOVL32rr; break;
> -    case X86::ATOMMAX32:  Opc = X86::CMOVG32rr; break;
> -    case X86::ATOMUMIN32: Opc = X86::CMOVB32rr; break;
> -    case X86::ATOMUMAX32: Opc = X86::CMOVA32rr; break;
> -    case X86::ATOMMIN16:  Opc = X86::CMOVL16rr; break;
> -    case X86::ATOMMAX16:  Opc = X86::CMOVG16rr; break;
> -    case X86::ATOMUMIN16: Opc = X86::CMOVB16rr; break;
> -    case X86::ATOMUMAX16: Opc = X86::CMOVA16rr; break;
> -    case X86::ATOMMIN64:  Opc = X86::CMOVL64rr; break;
> -    case X86::ATOMMAX64:  Opc = X86::CMOVG64rr; break;
> -    case X86::ATOMUMIN64: Opc = X86::CMOVB64rr; break;
> -    case X86::ATOMUMAX64: Opc = X86::CMOVA64rr; break;
> -    // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
> -    }
> -    return EmitAtomicMinMaxWithCustomInserter(MI, BB, Opc);
> -  }
> -
> -  case X86::ATOMAND32:
> -  case X86::ATOMOR32:
> -  case X86::ATOMXOR32:
> -  case X86::ATOMNAND32: {
> -    bool Invert = false;
> -    unsigned RegOpc, ImmOpc;
> -    switch (MI->getOpcode()) {
> -    default: llvm_unreachable("illegal opcode!");
> -    case X86::ATOMAND32:
> -      RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; break;
> -    case X86::ATOMOR32:
> -      RegOpc = X86::OR32rr;  ImmOpc = X86::OR32ri; break;
> -    case X86::ATOMXOR32:
> -      RegOpc = X86::XOR32rr; ImmOpc = X86::XOR32ri; break;
> -    case X86::ATOMNAND32:
> -      RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; Invert = true; break;
> -    }
> -    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
> -                                               X86::MOV32rm, X86::LCMPXCHG32,
> -                                               X86::NOT32r, X86::EAX,
> -                                               &X86::GR32RegClass, Invert);
> -  }
> -
> +  case X86::ATOMAND8:
>   case X86::ATOMAND16:
> +  case X86::ATOMAND32:
> +  case X86::ATOMAND64:
> +    // Fall through
> +  case X86::ATOMOR8:
>   case X86::ATOMOR16:
> +  case X86::ATOMOR32:
> +  case X86::ATOMOR64:
> +    // Fall through
>   case X86::ATOMXOR16:
> -  case X86::ATOMNAND16: {
> -    bool Invert = false;
> -    unsigned RegOpc, ImmOpc;
> -    switch (MI->getOpcode()) {
> -    default: llvm_unreachable("illegal opcode!");
> -    case X86::ATOMAND16:
> -      RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; break;
> -    case X86::ATOMOR16:
> -      RegOpc = X86::OR16rr;  ImmOpc = X86::OR16ri; break;
> -    case X86::ATOMXOR16:
> -      RegOpc = X86::XOR16rr; ImmOpc = X86::XOR16ri; break;
> -    case X86::ATOMNAND16:
> -      RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; Invert = true; break;
> -    }
> -    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
> -                                               X86::MOV16rm, X86::LCMPXCHG16,
> -                                               X86::NOT16r, X86::AX,
> -                                               &X86::GR16RegClass, Invert);
> -  }
> -
> -  case X86::ATOMAND8:
> -  case X86::ATOMOR8:
>   case X86::ATOMXOR8:
> -  case X86::ATOMNAND8: {
> -    bool Invert = false;
> -    unsigned RegOpc, ImmOpc;
> -    switch (MI->getOpcode()) {
> -    default: llvm_unreachable("illegal opcode!");
> -    case X86::ATOMAND8:
> -      RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; break;
> -    case X86::ATOMOR8:
> -      RegOpc = X86::OR8rr;  ImmOpc = X86::OR8ri; break;
> -    case X86::ATOMXOR8:
> -      RegOpc = X86::XOR8rr; ImmOpc = X86::XOR8ri; break;
> -    case X86::ATOMNAND8:
> -      RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; Invert = true; break;
> -    }
> -    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
> -                                               X86::MOV8rm, X86::LCMPXCHG8,
> -                                               X86::NOT8r, X86::AL,
> -                                               &X86::GR8RegClass, Invert);
> -  }
> -
> -  // This group is for 64-bit host.
> -  case X86::ATOMAND64:
> -  case X86::ATOMOR64:
> +  case X86::ATOMXOR32:
>   case X86::ATOMXOR64:
> -  case X86::ATOMNAND64: {
> -    bool Invert = false;
> -    unsigned RegOpc, ImmOpc;
> -    switch (MI->getOpcode()) {
> -    default: llvm_unreachable("illegal opcode!");
> -    case X86::ATOMAND64:
> -      RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; break;
> -    case X86::ATOMOR64:
> -      RegOpc = X86::OR64rr;  ImmOpc = X86::OR64ri32; break;
> -    case X86::ATOMXOR64:
> -      RegOpc = X86::XOR64rr; ImmOpc = X86::XOR64ri32; break;
> -    case X86::ATOMNAND64:
> -      RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; Invert = true; break;
> -    }
> -    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
> -                                               X86::MOV64rm, X86::LCMPXCHG64,
> -                                               X86::NOT64r, X86::RAX,
> -                                               &X86::GR64RegClass, Invert);
> -  }
> +    // Fall through
> +  case X86::ATOMNAND8:
> +  case X86::ATOMNAND16:
> +  case X86::ATOMNAND32:
> +  case X86::ATOMNAND64:
> +    // Fall through
> +  case X86::ATOMMAX16:
> +  case X86::ATOMMAX32:
> +  case X86::ATOMMAX64:
> +    // Fall through
> +  case X86::ATOMMIN16:
> +  case X86::ATOMMIN32:
> +  case X86::ATOMMIN64:
> +    // Fall through
> +  case X86::ATOMUMAX16:
> +  case X86::ATOMUMAX32:
> +  case X86::ATOMUMAX64:
> +    // Fall through
> +  case X86::ATOMUMIN16:
> +  case X86::ATOMUMIN32:
> +  case X86::ATOMUMIN64:
> +    return EmitAtomicLoadArith(MI, BB);
> 
>   // This group does 64-bit operations on a 32-bit host.
>   case X86::ATOMAND6432:
> @@ -13308,44 +13333,8 @@
>   case X86::ATOMNAND6432:
>   case X86::ATOMADD6432:
>   case X86::ATOMSUB6432:
> -  case X86::ATOMSWAP6432: {
> -    bool Invert = false;
> -    unsigned RegOpcL, RegOpcH, ImmOpcL, ImmOpcH;
> -    switch (MI->getOpcode()) {
> -    default: llvm_unreachable("illegal opcode!");
> -    case X86::ATOMAND6432:
> -      RegOpcL = RegOpcH = X86::AND32rr;
> -      ImmOpcL = ImmOpcH = X86::AND32ri;
> -      break;
> -    case X86::ATOMOR6432:
> -      RegOpcL = RegOpcH = X86::OR32rr;
> -      ImmOpcL = ImmOpcH = X86::OR32ri;
> -      break;
> -    case X86::ATOMXOR6432:
> -      RegOpcL = RegOpcH = X86::XOR32rr;
> -      ImmOpcL = ImmOpcH = X86::XOR32ri;
> -      break;
> -    case X86::ATOMNAND6432:
> -      RegOpcL = RegOpcH = X86::AND32rr;
> -      ImmOpcL = ImmOpcH = X86::AND32ri;
> -      Invert = true;
> -      break;
> -    case X86::ATOMADD6432:
> -      RegOpcL = X86::ADD32rr; RegOpcH = X86::ADC32rr;
> -      ImmOpcL = X86::ADD32ri; ImmOpcH = X86::ADC32ri;
> -      break;
> -    case X86::ATOMSUB6432:
> -      RegOpcL = X86::SUB32rr; RegOpcH = X86::SBB32rr;
> -      ImmOpcL = X86::SUB32ri; ImmOpcH = X86::SBB32ri;
> -      break;
> -    case X86::ATOMSWAP6432:
> -      RegOpcL = RegOpcH = X86::MOV32rr;
> -      ImmOpcL = ImmOpcH = X86::MOV32ri;
> -      break;
> -    }
> -    return EmitAtomicBit6432WithCustomInserter(MI, BB, RegOpcL, RegOpcH,
> -                                               ImmOpcL, ImmOpcH, Invert);
> -  }
> +  case X86::ATOMSWAP6432:
> +    return EmitAtomicLoadArith6432(MI, BB);
> 
>   case X86::VASTART_SAVE_XMM_REGS:
>     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
> 
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=164281&r1=164280&r2=164281&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Wed Sep 19 22:06:15 2012
> @@ -861,36 +861,17 @@
>                                    MachineBasicBlock *BB) const;
>     MachineBasicBlock *EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const;
> 
> -    /// Utility function to emit atomic bitwise operations (and, or, xor).
> -    /// It takes the bitwise instruction to expand, the associated machine basic
> -    /// block, and the associated X86 opcodes for reg/reg and reg/imm.
> -    MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter(
> -                                                    MachineInstr *BInstr,
> -                                                    MachineBasicBlock *BB,
> -                                                    unsigned regOpc,
> -                                                    unsigned immOpc,
> -                                                    unsigned loadOpc,
> -                                                    unsigned cxchgOpc,
> -                                                    unsigned notOpc,
> -                                                    unsigned EAXreg,
> -                                              const TargetRegisterClass *RC,
> -                                                    bool Invert = false) const;
> -
> -    MachineBasicBlock *EmitAtomicBit6432WithCustomInserter(
> -                                                    MachineInstr *BInstr,
> -                                                    MachineBasicBlock *BB,
> -                                                    unsigned regOpcL,
> -                                                    unsigned regOpcH,
> -                                                    unsigned immOpcL,
> -                                                    unsigned immOpcH,
> -                                                    bool Invert = false) const;
> -
> -    /// Utility function to emit atomic min and max.  It takes the min/max
> -    /// instruction to expand, the associated basic block, and the associated
> -    /// cmov opcode for moving the min or max value.
> -    MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr,
> -                                                          MachineBasicBlock *BB,
> -                                                        unsigned cmovOpc) const;
> +    /// Utility function to emit atomic-load-arith operations (and, or, xor,
> +    /// nand, max, min, umax, umin). It takes the corresponding instruction to
> +    /// expand, the associated machine basic block, and the associated X86
> +    /// opcodes for reg/reg.
> +    MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI,
> +                                           MachineBasicBlock *MBB) const;
> +
> +    /// Utility function to emit atomic-load-arith operations (and, or, xor,
> +    /// nand, add, sub, swap) for 64-bit operands on 32-bit target.
> +    MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
> +                                               MachineBasicBlock *MBB) const;
> 
>     // Utility function to emit the low-level va_arg code for X86-64.
>     MachineBasicBlock *EmitVAARG64WithCustomInserter(
> 
> Modified: llvm/trunk/lib/Target/X86/X86InstrCompiler.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrCompiler.td?rev=164281&r1=164280&r2=164281&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrCompiler.td (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrCompiler.td Wed Sep 19 22:06:15 2012
> @@ -483,8 +483,7 @@
> //===----------------------------------------------------------------------===//
> 
> // Atomic exchange, and, or, xor
> -let Constraints = "$val = $dst", Defs = [EFLAGS],
> -                  usesCustomInserter = 1 in {
> +let usesCustomInserter = 1 in {
> 
> def ATOMAND8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
>                "#ATOMAND8 PSEUDO!",
> @@ -578,11 +577,7 @@
>                [(set GR64:$dst, (atomic_load_umax_64 addr:$ptr, GR64:$val))]>;
> }
> 
> -let Constraints = "$val1 = $dst1, $val2 = $dst2",
> -                  Defs = [EFLAGS, EAX, EBX, ECX, EDX],
> -                  Uses = [EAX, EBX, ECX, EDX],
> -                  mayLoad = 1, mayStore = 1,
> -                  usesCustomInserter = 1 in {
> +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1 in {
> def ATOMAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
>                                (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
>                "#ATOMAND6432 PSEUDO!", []>;
> 
> Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=164281&r1=164280&r2=164281&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Wed Sep 19 22:06:15 2012
> @@ -2266,7 +2266,7 @@
> }
> 
> /// getCondFromCmovOpc - return condition code of a CMov opcode.
> -static X86::CondCode getCondFromCMovOpc(unsigned Opc) {
> +X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
>   switch (Opc) {
>   default: return X86::COND_INVALID;
>   case X86::CMOVA16rm:  case X86::CMOVA16rr:  case X86::CMOVA32rm:
> @@ -3314,7 +3314,7 @@
>         if (OldCC != X86::COND_INVALID)
>           OpcIsSET = true;
>         else
> -          OldCC = getCondFromCMovOpc(Instr.getOpcode());
> +          OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
>       }
>       if (OldCC == X86::COND_INVALID) return false;
>     }
> 
> Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.h?rev=164281&r1=164280&r2=164281&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrInfo.h (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrInfo.h Wed Sep 19 22:06:15 2012
> @@ -61,6 +61,9 @@
>   // Turn condition code into conditional branch opcode.
>   unsigned GetCondBranchFromCond(CondCode CC);
> 
> +  // Turn CMov opcode into condition code.
> +  CondCode getCondFromCMovOpc(unsigned Opc);
> +
>   /// GetOppositeBranchCondition - Return the inverse of the specified cond,
>   /// e.g. turning COND_E to COND_NE.
>   CondCode GetOppositeBranchCondition(X86::CondCode CC);
> 
> Modified: llvm/trunk/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll?rev=164281&r1=164280&r2=164281&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll Wed Sep 19 22:06:15 2012
> @@ -7,17 +7,16 @@
> define void @t(i64* nocapture %p) nounwind ssp {
> entry:
> ; CHECK: t:
> -; CHECK: movl $1
> -; CHECK: movl (%ebp), %eax
> -; CHECK: movl 4(%ebp), %edx
> +; CHECK: movl ([[REG:%[a-z]+]]), %eax
> +; CHECK: movl 4([[REG]]), %edx
> ; CHECK: LBB0_1:
> -; CHECK-NOT: movl $1
> -; CHECK-NOT: movl $0
> +; CHECK: movl $1
> ; CHECK: addl
> +; CHECK: movl $0
> ; CHECK: adcl
> ; CHECK: lock
> -; CHECK: cmpxchg8b
> -; CHECK: jne
> +; CHECK-NEXT: cmpxchg8b ([[REG]])
> +; CHECK-NEXT: jne
>   %0 = atomicrmw add i64* %p, i64 1 seq_cst
>   ret void
> }
> 
> Added: llvm/trunk/test/CodeGen/X86/atomic16.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/atomic16.ll?rev=164281&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/atomic16.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/atomic16.ll Wed Sep 19 22:06:15 2012
> @@ -0,0 +1,250 @@
> +; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
> +; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
> +
> + at sc16 = external global i16
> +
> +define void @atomic_fetch_add16() nounwind {
> +; X64:   atomic_fetch_add16
> +; X32:   atomic_fetch_add16
> +entry:
> +; 32-bit
> +  %t1 = atomicrmw add  i16* @sc16, i16 1 acquire
> +; X64:       lock
> +; X64:       incw
> +; X32:       lock
> +; X32:       incw
> +  %t2 = atomicrmw add  i16* @sc16, i16 3 acquire
> +; X64:       lock
> +; X64:       addw $3
> +; X32:       lock
> +; X32:       addw $3
> +  %t3 = atomicrmw add  i16* @sc16, i16 5 acquire
> +; X64:       lock
> +; X64:       xaddw
> +; X32:       lock
> +; X32:       xaddw
> +  %t4 = atomicrmw add  i16* @sc16, i16 %t3 acquire
> +; X64:       lock
> +; X64:       addw
> +; X32:       lock
> +; X32:       addw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_sub16() nounwind {
> +; X64:   atomic_fetch_sub16
> +; X32:   atomic_fetch_sub16
> +  %t1 = atomicrmw sub  i16* @sc16, i16 1 acquire
> +; X64:       lock
> +; X64:       decw
> +; X32:       lock
> +; X32:       decw
> +  %t2 = atomicrmw sub  i16* @sc16, i16 3 acquire
> +; X64:       lock
> +; X64:       subw $3
> +; X32:       lock
> +; X32:       subw $3
> +  %t3 = atomicrmw sub  i16* @sc16, i16 5 acquire
> +; X64:       lock
> +; X64:       xaddw
> +; X32:       lock
> +; X32:       xaddw
> +  %t4 = atomicrmw sub  i16* @sc16, i16 %t3 acquire
> +; X64:       lock
> +; X64:       subw
> +; X32:       lock
> +; X32:       subw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_and16() nounwind {
> +; X64:   atomic_fetch_and16
> +; X32:   atomic_fetch_and16
> +  %t1 = atomicrmw and  i16* @sc16, i16 3 acquire
> +; X64:       lock
> +; X64:       andw $3
> +; X32:       lock
> +; X32:       andw $3
> +  %t2 = atomicrmw and  i16* @sc16, i16 5 acquire
> +; X64:       andw
> +; X64:       lock
> +; X64:       cmpxchgw
> +; X32:       andw
> +; X32:       lock
> +; X32:       cmpxchgw
> +  %t3 = atomicrmw and  i16* @sc16, i16 %t2 acquire
> +; X64:       lock
> +; X64:       andw
> +; X32:       lock
> +; X32:       andw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_or16() nounwind {
> +; X64:   atomic_fetch_or16
> +; X32:   atomic_fetch_or16
> +  %t1 = atomicrmw or   i16* @sc16, i16 3 acquire
> +; X64:       lock
> +; X64:       orw $3
> +; X32:       lock
> +; X32:       orw $3
> +  %t2 = atomicrmw or   i16* @sc16, i16 5 acquire
> +; X64:       orw
> +; X64:       lock
> +; X64:       cmpxchgw
> +; X32:       orw
> +; X32:       lock
> +; X32:       cmpxchgw
> +  %t3 = atomicrmw or   i16* @sc16, i16 %t2 acquire
> +; X64:       lock
> +; X64:       orw
> +; X32:       lock
> +; X32:       orw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_xor16() nounwind {
> +; X64:   atomic_fetch_xor16
> +; X32:   atomic_fetch_xor16
> +  %t1 = atomicrmw xor  i16* @sc16, i16 3 acquire
> +; X64:       lock
> +; X64:       xorw $3
> +; X32:       lock
> +; X32:       xorw $3
> +  %t2 = atomicrmw xor  i16* @sc16, i16 5 acquire
> +; X64:       xorw
> +; X64:       lock
> +; X64:       cmpxchgw
> +; X32:       xorw
> +; X32:       lock
> +; X32:       cmpxchgw
> +  %t3 = atomicrmw xor  i16* @sc16, i16 %t2 acquire
> +; X64:       lock
> +; X64:       xorw
> +; X32:       lock
> +; X32:       xorw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_nand16(i16 %x) nounwind {
> +; X64:   atomic_fetch_nand16
> +; X32:   atomic_fetch_nand16
> +  %t1 = atomicrmw nand i16* @sc16, i16 %x acquire
> +; X64:       andw
> +; X64:       notw
> +; X64:       lock
> +; X64:       cmpxchgw
> +; X32:       andw
> +; X32:       notw
> +; X32:       lock
> +; X32:       cmpxchgw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_max16(i16 %x) nounwind {
> +  %t1 = atomicrmw max  i16* @sc16, i16 %x acquire
> +; X64:       cmpw
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgw
> +
> +; X32:       cmpw
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_min16(i16 %x) nounwind {
> +  %t1 = atomicrmw min  i16* @sc16, i16 %x acquire
> +; X64:       cmpw
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgw
> +
> +; X32:       cmpw
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_umax16(i16 %x) nounwind {
> +  %t1 = atomicrmw umax i16* @sc16, i16 %x acquire
> +; X64:       cmpw
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgw
> +
> +; X32:       cmpw
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_umin16(i16 %x) nounwind {
> +  %t1 = atomicrmw umin i16* @sc16, i16 %x acquire
> +; X64:       cmpw
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgw
> +; X32:       cmpw
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_cmpxchg16() nounwind {
> +  %t1 = cmpxchg i16* @sc16, i16 0, i16 1 acquire
> +; X64:       lock
> +; X64:       cmpxchgw
> +; X32:       lock
> +; X32:       cmpxchgw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_store16(i16 %x) nounwind {
> +  store atomic i16 %x, i16* @sc16 release, align 4
> +; X64-NOT:   lock
> +; X64:       movw
> +; X32-NOT:   lock
> +; X32:       movw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_swap16(i16 %x) nounwind {
> +  %t1 = atomicrmw xchg i16* @sc16, i16 %x acquire
> +; X64-NOT:   lock
> +; X64:       xchgw
> +; X32-NOT:   lock
> +; X32:       xchgw
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> 
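
The i16 add/sub functions above encode the selection policy the checks care
about: an atomicrmw whose result is dead is expected to fold into a single
lock-prefixed RMW (lock incw/addw/subw), a result that is still live is
expected to come out as lock xaddw, and the remaining ops fall back to the
cmpxchg spin loop further down. A minimal sketch of the two add shapes, with
made-up names, runnable under the same llc invocations as the test:

    @g16 = external global i16

    define void @add_result_dead(i16 %v) nounwind {
      ; result unused: a single "lock addw" is expected
      %t = atomicrmw add i16* @g16, i16 %v acquire
      ret void
    }

    define i16 @add_result_live(i16 %v) nounwind {
      ; result used: "lock xaddw" is expected instead
      %t = atomicrmw add i16* @g16, i16 %v acquire
      ret i16 %t
    }
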
> Added: llvm/trunk/test/CodeGen/X86/atomic32.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/atomic32.ll?rev=164281&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/atomic32.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/atomic32.ll Wed Sep 19 22:06:15 2012
> @@ -0,0 +1,250 @@
> +; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
> +; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
> +
> + at sc32 = external global i32
> +
> +define void @atomic_fetch_add32() nounwind {
> +; X64:   atomic_fetch_add32
> +; X32:   atomic_fetch_add32
> +entry:
> +; 32-bit
> +  %t1 = atomicrmw add  i32* @sc32, i32 1 acquire
> +; X64:       lock
> +; X64:       incl
> +; X32:       lock
> +; X32:       incl
> +  %t2 = atomicrmw add  i32* @sc32, i32 3 acquire
> +; X64:       lock
> +; X64:       addl $3
> +; X32:       lock
> +; X32:       addl $3
> +  %t3 = atomicrmw add  i32* @sc32, i32 5 acquire
> +; X64:       lock
> +; X64:       xaddl
> +; X32:       lock
> +; X32:       xaddl
> +  %t4 = atomicrmw add  i32* @sc32, i32 %t3 acquire
> +; X64:       lock
> +; X64:       addl
> +; X32:       lock
> +; X32:       addl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_sub32() nounwind {
> +; X64:   atomic_fetch_sub32
> +; X32:   atomic_fetch_sub32
> +  %t1 = atomicrmw sub  i32* @sc32, i32 1 acquire
> +; X64:       lock
> +; X64:       decl
> +; X32:       lock
> +; X32:       decl
> +  %t2 = atomicrmw sub  i32* @sc32, i32 3 acquire
> +; X64:       lock
> +; X64:       subl $3
> +; X32:       lock
> +; X32:       subl $3
> +  %t3 = atomicrmw sub  i32* @sc32, i32 5 acquire
> +; X64:       lock
> +; X64:       xaddl
> +; X32:       lock
> +; X32:       xaddl
> +  %t4 = atomicrmw sub  i32* @sc32, i32 %t3 acquire
> +; X64:       lock
> +; X64:       subl
> +; X32:       lock
> +; X32:       subl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_and32() nounwind {
> +; X64:   atomic_fetch_and32
> +; X32:   atomic_fetch_and32
> +  %t1 = atomicrmw and  i32* @sc32, i32 3 acquire
> +; X64:       lock
> +; X64:       andl $3
> +; X32:       lock
> +; X32:       andl $3
> +  %t2 = atomicrmw and  i32* @sc32, i32 5 acquire
> +; X64:       andl
> +; X64:       lock
> +; X64:       cmpxchgl
> +; X32:       andl
> +; X32:       lock
> +; X32:       cmpxchgl
> +  %t3 = atomicrmw and  i32* @sc32, i32 %t2 acquire
> +; X64:       lock
> +; X64:       andl
> +; X32:       lock
> +; X32:       andl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_or32() nounwind {
> +; X64:   atomic_fetch_or32
> +; X32:   atomic_fetch_or32
> +  %t1 = atomicrmw or   i32* @sc32, i32 3 acquire
> +; X64:       lock
> +; X64:       orl $3
> +; X32:       lock
> +; X32:       orl $3
> +  %t2 = atomicrmw or   i32* @sc32, i32 5 acquire
> +; X64:       orl
> +; X64:       lock
> +; X64:       cmpxchgl
> +; X32:       orl
> +; X32:       lock
> +; X32:       cmpxchgl
> +  %t3 = atomicrmw or   i32* @sc32, i32 %t2 acquire
> +; X64:       lock
> +; X64:       orl
> +; X32:       lock
> +; X32:       orl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_xor32() nounwind {
> +; X64:   atomic_fetch_xor32
> +; X32:   atomic_fetch_xor32
> +  %t1 = atomicrmw xor  i32* @sc32, i32 3 acquire
> +; X64:       lock
> +; X64:       xorl $3
> +; X32:       lock
> +; X32:       xorl $3
> +  %t2 = atomicrmw xor  i32* @sc32, i32 5 acquire
> +; X64:       xorl
> +; X64:       lock
> +; X64:       cmpxchgl
> +; X32:       xorl
> +; X32:       lock
> +; X32:       cmpxchgl
> +  %t3 = atomicrmw xor  i32* @sc32, i32 %t2 acquire
> +; X64:       lock
> +; X64:       xorl
> +; X32:       lock
> +; X32:       xorl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_nand32(i32 %x) nounwind {
> +; X64:   atomic_fetch_nand32
> +; X32:   atomic_fetch_nand32
> +  %t1 = atomicrmw nand i32* @sc32, i32 %x acquire
> +; X64:       andl
> +; X64:       notl
> +; X64:       lock
> +; X64:       cmpxchgl
> +; X32:       andl
> +; X32:       notl
> +; X32:       lock
> +; X32:       cmpxchgl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_max32(i32 %x) nounwind {
> +  %t1 = atomicrmw max  i32* @sc32, i32 %x acquire
> +; X64:       cmpl
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgl
> +
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_min32(i32 %x) nounwind {
> +  %t1 = atomicrmw min  i32* @sc32, i32 %x acquire
> +; X64:       cmpl
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgl
> +
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_umax32(i32 %x) nounwind {
> +  %t1 = atomicrmw umax i32* @sc32, i32 %x acquire
> +; X64:       cmpl
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgl
> +
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_umin32(i32 %x) nounwind {
> +  %t1 = atomicrmw umin i32* @sc32, i32 %x acquire
> +; X64:       cmpl
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgl
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_cmpxchg32() nounwind {
> +  %t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire
> +; X64:       lock
> +; X64:       cmpxchgl
> +; X32:       lock
> +; X32:       cmpxchgl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_store32(i32 %x) nounwind {
> +  store atomic i32 %x, i32* @sc32 release, align 4
> +; X64-NOT:   lock
> +; X64:       movl
> +; X32-NOT:   lock
> +; X32:       movl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_swap32(i32 %x) nounwind {
> +  %t1 = atomicrmw xchg i32* @sc32, i32 %x acquire
> +; X64-NOT:   lock
> +; X64:       xchgl
> +; X32-NOT:   lock
> +; X32:       xchgl
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> 
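
The and/or/xor functions follow the same split as the i16 file: a constant op
with a dead result folds to one lock andl/orl/xorl, but once the old value is
needed no single instruction returns it, so the checks expect the operation
plus a lock cmpxchgl retry loop; min/max/umin/umax additionally expect a cmpl
and a cmov in front of the cmpxchgl. A sketch of the max case, with made-up
names and assuming a cmov-capable CPU like the corei7 in the RUN lines:

    @g32 = external global i32

    define i32 @fetch_max(i32 %v) nounwind {
      ; no RMW instruction computes max, so the expected shape is
      ; load, cmpl, cmov, lock cmpxchgl, branch back on failure
      %old = atomicrmw max i32* @g32, i32 %v acquire
      ret i32 %old
    }
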
> Added: llvm/trunk/test/CodeGen/X86/atomic64.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/atomic64.ll?rev=164281&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/atomic64.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/atomic64.ll Wed Sep 19 22:06:15 2012
> @@ -0,0 +1,216 @@
> +; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
> +
> + at sc64 = external global i64
> +
> +define void @atomic_fetch_add64() nounwind {
> +; X64:   atomic_fetch_add64
> +entry:
> +  %t1 = atomicrmw add  i64* @sc64, i64 1 acquire
> +; X64:       lock
> +; X64:       incq
> +  %t2 = atomicrmw add  i64* @sc64, i64 3 acquire
> +; X64:       lock
> +; X64:       addq $3
> +  %t3 = atomicrmw add  i64* @sc64, i64 5 acquire
> +; X64:       lock
> +; X64:       xaddq
> +  %t4 = atomicrmw add  i64* @sc64, i64 %t3 acquire
> +; X64:       lock
> +; X64:       addq
> +  ret void
> +; X64:       ret
> +}
> +
> +define void @atomic_fetch_sub64() nounwind {
> +; X64:   atomic_fetch_sub64
> +  %t1 = atomicrmw sub  i64* @sc64, i64 1 acquire
> +; X64:       lock
> +; X64:       decq
> +  %t2 = atomicrmw sub  i64* @sc64, i64 3 acquire
> +; X64:       lock
> +; X64:       subq $3
> +  %t3 = atomicrmw sub  i64* @sc64, i64 5 acquire
> +; X64:       lock
> +; X64:       xaddq
> +  %t4 = atomicrmw sub  i64* @sc64, i64 %t3 acquire
> +; X64:       lock
> +; X64:       subq
> +  ret void
> +; X64:       ret
> +}
> +
> +define void @atomic_fetch_and64() nounwind {
> +; X64:   atomic_fetch_and64
> +  %t1 = atomicrmw and  i64* @sc64, i64 3 acquire
> +; X64:       lock
> +; X64:       andq $3
> +  %t2 = atomicrmw and  i64* @sc64, i64 5 acquire
> +; X64:       andq
> +; X64:       lock
> +; X64:       cmpxchgq
> +  %t3 = atomicrmw and  i64* @sc64, i64 %t2 acquire
> +; X64:       lock
> +; X64:       andq
> +  ret void
> +; X64:       ret
> +}
> +
> +define void @atomic_fetch_or64() nounwind {
> +; X64:   atomic_fetch_or64
> +  %t1 = atomicrmw or   i64* @sc64, i64 3 acquire
> +; X64:       lock
> +; X64:       orq $3
> +  %t2 = atomicrmw or   i64* @sc64, i64 5 acquire
> +; X64:       orq
> +; X64:       lock
> +; X64:       cmpxchgq
> +  %t3 = atomicrmw or   i64* @sc64, i64 %t2 acquire
> +; X64:       lock
> +; X64:       orq
> +  ret void
> +; X64:       ret
> +}
> +
> +define void @atomic_fetch_xor64() nounwind {
> +; X64:   atomic_fetch_xor64
> +  %t1 = atomicrmw xor  i64* @sc64, i64 3 acquire
> +; X64:       lock
> +; X64:       xorq $3
> +  %t2 = atomicrmw xor  i64* @sc64, i64 5 acquire
> +; X64:       xorq
> +; X64:       lock
> +; X64:       cmpxchgq
> +  %t3 = atomicrmw xor  i64* @sc64, i64 %t2 acquire
> +; X64:       lock
> +; X64:       xorq
> +  ret void
> +; X64:       ret
> +}
> +
> +define void @atomic_fetch_nand64(i64 %x) nounwind {
> +; X64:   atomic_fetch_nand64
> +; X32:   atomic_fetch_nand64
> +  %t1 = atomicrmw nand i64* @sc64, i64 %x acquire
> +; X64:       andq
> +; X64:       notq
> +; X64:       lock
> +; X64:       cmpxchgq
> +; X32:       andl
> +; X32:       andl
> +; X32:       notl
> +; X32:       notl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_max64(i64 %x) nounwind {
> +  %t1 = atomicrmw max  i64* @sc64, i64 %x acquire
> +; X64:       cmpq
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgq
> +
> +; X32:       cmpl
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_min64(i64 %x) nounwind {
> +  %t1 = atomicrmw min  i64* @sc64, i64 %x acquire
> +; X64:       cmpq
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgq
> +
> +; X32:       cmpl
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_umax64(i64 %x) nounwind {
> +  %t1 = atomicrmw umax i64* @sc64, i64 %x acquire
> +; X64:       cmpq
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgq
> +
> +; X32:       cmpl
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_umin64(i64 %x) nounwind {
> +  %t1 = atomicrmw umin i64* @sc64, i64 %x acquire
> +; X64:       cmpq
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgq
> +
> +; X32:       cmpl
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_cmpxchg64() nounwind {
> +  %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
> +; X64:       lock
> +; X64:       cmpxchgq
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_store64(i64 %x) nounwind {
> +  store atomic i64 %x, i64* @sc64 release, align 8
> +; X64-NOT:   lock
> +; X64:       movq
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_swap64(i64 %x) nounwind {
> +  %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire
> +; X64-NOT:   lock
> +; X64:       xchgq
> +; X32:       lock
> +; X32:       xchg8b
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> 
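
One note on atomic64.ll: it only has the x86-64 RUN line, so the X32 check
lines inside it (in nand64, the min/max group, cmpxchg, store and swap) are
never matched by FileCheck and look like copies of the atomic6432.ll
expectations. They are harmless, but the dead "xchg8b" check stands out:
there is no xchg8b instruction, and the 64-bit exchange on a 32-bit target
would presumably show up as a lock cmpxchg8b loop, i.e. something like

    ; X32:       lock
    ; X32:       cmpxchg8b

if these prefixes are ever wired to a 32-bit RUN line; otherwise they could
simply be dropped, since atomic6432.ll already covers that target.
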
> Added: llvm/trunk/test/CodeGen/X86/atomic6432.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/atomic6432.ll?rev=164281&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/atomic6432.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/atomic6432.ll Wed Sep 19 22:06:15 2012
> @@ -0,0 +1,209 @@
> +; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
> +; XFAIL: *
> +
> + at sc64 = external global i64
> +
> +define void @atomic_fetch_add64() nounwind {
> +; X32:   atomic_fetch_add64
> +entry:
> +  %t1 = atomicrmw add  i64* @sc64, i64 1 acquire
> +; X32:       addl
> +; X32:       adcl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t2 = atomicrmw add  i64* @sc64, i64 3 acquire
> +; X32:       addl
> +; X32:       adcl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t3 = atomicrmw add  i64* @sc64, i64 5 acquire
> +; X32:       addl
> +; X32:       adcl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t4 = atomicrmw add  i64* @sc64, i64 %t3 acquire
> +; X32:       addl
> +; X32:       adcl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_sub64() nounwind {
> +; X32:   atomic_fetch_sub64
> +  %t1 = atomicrmw sub  i64* @sc64, i64 1 acquire
> +; X32:       subl
> +; X32:       sbbl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t2 = atomicrmw sub  i64* @sc64, i64 3 acquire
> +; X32:       subl
> +; X32:       sbbl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t3 = atomicrmw sub  i64* @sc64, i64 5 acquire
> +; X32:       subl
> +; X32:       sbbl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t4 = atomicrmw sub  i64* @sc64, i64 %t3 acquire
> +; X32:       subl
> +; X32:       sbbl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_and64() nounwind {
> +; X32:   atomic_fetch_and64
> +  %t1 = atomicrmw and  i64* @sc64, i64 3 acquire
> +; X32:       andl
> +; X32:       andl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t2 = atomicrmw and  i64* @sc64, i64 5 acquire
> +; X32:       andl
> +; X32:       andl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t3 = atomicrmw and  i64* @sc64, i64 %t2 acquire
> +; X32:       andl
> +; X32:       andl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_or64() nounwind {
> +; X32:   atomic_fetch_or64
> +  %t1 = atomicrmw or   i64* @sc64, i64 3 acquire
> +; X32:       orl
> +; X32:       orl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t2 = atomicrmw or   i64* @sc64, i64 5 acquire
> +; X32:       orl
> +; X32:       orl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t3 = atomicrmw or   i64* @sc64, i64 %t2 acquire
> +; X32:       orl
> +; X32:       orl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_xor64() nounwind {
> +; X32:   atomic_fetch_xor64
> +  %t1 = atomicrmw xor  i64* @sc64, i64 3 acquire
> +; X32:       xorl
> +; X32:       xorl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t2 = atomicrmw xor  i64* @sc64, i64 5 acquire
> +; X32:       xorl
> +; X32:       xorl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  %t3 = atomicrmw xor  i64* @sc64, i64 %t2 acquire
> +; X32:       xorl
> +; X32:       xorl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_nand64(i64 %x) nounwind {
> +; X32:   atomic_fetch_nand64
> +  %t1 = atomicrmw nand i64* @sc64, i64 %x acquire
> +; X32:       andl
> +; X32:       andl
> +; X32:       notl
> +; X32:       notl
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_max64(i64 %x) nounwind {
> +  %t1 = atomicrmw max  i64* @sc64, i64 %x acquire
> +; X32:       cmpl
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_min64(i64 %x) nounwind {
> +  %t1 = atomicrmw min  i64* @sc64, i64 %x acquire
> +; X32:       cmpl
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_umax64(i64 %x) nounwind {
> +  %t1 = atomicrmw umax i64* @sc64, i64 %x acquire
> +; X32:       cmpl
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_umin64(i64 %x) nounwind {
> +  %t1 = atomicrmw umin i64* @sc64, i64 %x acquire
> +; X32:       cmpl
> +; X32:       cmpl
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_cmpxchg64() nounwind {
> +  %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_store64(i64 %x) nounwind {
> +  store atomic i64 %x, i64* @sc64 release, align 8
> +; X32:       lock
> +; X32:       cmpxchg8b
> +  ret void
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_swap64(i64 %x) nounwind {
> +  %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire
> +; X32:       lock
> +; X32:       xchg8b
> +  ret void
> +; X32:       ret
> +}
> 
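
This file is XFAILed, so none of the above is enforced yet, but the
expectations it records are consistent: each i64 RMW op on i386 becomes a
pair of 32-bit ops (addl/adcl, subl/sbbl, two andl/orl/xorl, or a cmpl/cmov
cascade for min/max) feeding a lock cmpxchg8b retry loop, with the old value
coming back in EDX:EAX. The swap case checks "xchg8b" again, which presumably
should read cmpxchg8b by the time the test is enabled. A standalone sketch of
the add case, with made-up names:

    @w = external global i64

    define i64 @fetch_add64(i64 %v) nounwind {
      ; on i386 this is expected to expand to addl/adcl feeding a
      ; lock cmpxchg8b loop, the old value ending up in EDX:EAX
      %old = atomicrmw add i64* @w, i64 %v acquire
      ret i64 %old
    }
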
> Added: llvm/trunk/test/CodeGen/X86/atomic8.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/atomic8.ll?rev=164281&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/atomic8.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/atomic8.ll Wed Sep 19 22:06:15 2012
> @@ -0,0 +1,251 @@
> +; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
> +; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
> +; XFAIL: *
> +
> + at sc8 = external global i8
> +
> +define void @atomic_fetch_add8() nounwind {
> +; X64:   atomic_fetch_add8
> +; X32:   atomic_fetch_add8
> +entry:
> +; 32-bit
> +  %t1 = atomicrmw add  i8* @sc8, i8 1 acquire
> +; X64:       lock
> +; X64:       incb
> +; X32:       lock
> +; X32:       incb
> +  %t2 = atomicrmw add  i8* @sc8, i8 3 acquire
> +; X64:       lock
> +; X64:       addb $3
> +; X32:       lock
> +; X32:       addb $3
> +  %t3 = atomicrmw add  i8* @sc8, i8 5 acquire
> +; X64:       lock
> +; X64:       xaddb
> +; X32:       lock
> +; X32:       xaddb
> +  %t4 = atomicrmw add  i8* @sc8, i8 %t3 acquire
> +; X64:       lock
> +; X64:       addb
> +; X32:       lock
> +; X32:       addb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_sub8() nounwind {
> +; X64:   atomic_fetch_sub8
> +; X32:   atomic_fetch_sub8
> +  %t1 = atomicrmw sub  i8* @sc8, i8 1 acquire
> +; X64:       lock
> +; X64:       decb
> +; X32:       lock
> +; X32:       decb
> +  %t2 = atomicrmw sub  i8* @sc8, i8 3 acquire
> +; X64:       lock
> +; X64:       subb $3
> +; X32:       lock
> +; X32:       subb $3
> +  %t3 = atomicrmw sub  i8* @sc8, i8 5 acquire
> +; X64:       lock
> +; X64:       xaddb
> +; X32:       lock
> +; X32:       xaddb
> +  %t4 = atomicrmw sub  i8* @sc8, i8 %t3 acquire
> +; X64:       lock
> +; X64:       subb
> +; X32:       lock
> +; X32:       subb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_and8() nounwind {
> +; X64:   atomic_fetch_and8
> +; X32:   atomic_fetch_and8
> +  %t1 = atomicrmw and  i8* @sc8, i8 3 acquire
> +; X64:       lock
> +; X64:       andb $3
> +; X32:       lock
> +; X32:       andb $3
> +  %t2 = atomicrmw and  i8* @sc8, i8 5 acquire
> +; X64:       andb
> +; X64:       lock
> +; X64:       cmpxchgb
> +; X32:       andb
> +; X32:       lock
> +; X32:       cmpxchgb
> +  %t3 = atomicrmw and  i8* @sc8, i8 %t2 acquire
> +; X64:       lock
> +; X64:       andb
> +; X32:       lock
> +; X32:       andb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_or8() nounwind {
> +; X64:   atomic_fetch_or8
> +; X32:   atomic_fetch_or8
> +  %t1 = atomicrmw or   i8* @sc8, i8 3 acquire
> +; X64:       lock
> +; X64:       orb $3
> +; X32:       lock
> +; X32:       orb $3
> +  %t2 = atomicrmw or   i8* @sc8, i8 5 acquire
> +; X64:       orb
> +; X64:       lock
> +; X64:       cmpxchgb
> +; X32:       orb
> +; X32:       lock
> +; X32:       cmpxchgb
> +  %t3 = atomicrmw or   i8* @sc8, i8 %t2 acquire
> +; X64:       lock
> +; X64:       orb
> +; X32:       lock
> +; X32:       orb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_xor8() nounwind {
> +; X64:   atomic_fetch_xor8
> +; X32:   atomic_fetch_xor8
> +  %t1 = atomicrmw xor  i8* @sc8, i8 3 acquire
> +; X64:       lock
> +; X64:       xorb $3
> +; X32:       lock
> +; X32:       xorb $3
> +  %t2 = atomicrmw xor  i8* @sc8, i8 5 acquire
> +; X64:       xorb
> +; X64:       lock
> +; X64:       cmpxchgb
> +; X32:       xorb
> +; X32:       lock
> +; X32:       cmpxchgb
> +  %t3 = atomicrmw xor  i8* @sc8, i8 %t2 acquire
> +; X64:       lock
> +; X64:       xorb
> +; X32:       lock
> +; X32:       xorb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_nand8(i8 %x) nounwind {
> +; X64:   atomic_fetch_nand8
> +; X32:   atomic_fetch_nand8
> +  %t1 = atomicrmw nand i8* @sc8, i8 %x acquire
> +; X64:       andb
> +; X64:       notb
> +; X64:       lock
> +; X64:       cmpxchgb
> +; X32:       andb
> +; X32:       notb
> +; X32:       lock
> +; X32:       cmpxchgb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_max8(i8 %x) nounwind {
> +  %t1 = atomicrmw max  i8* @sc8, i8 %x acquire
> +; X64:       cmpb
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgb
> +
> +; X32:       cmpb
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_min8(i8 %x) nounwind {
> +  %t1 = atomicrmw min  i8* @sc8, i8 %x acquire
> +; X64:       cmpb
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgb
> +
> +; X32:       cmpb
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_umax8(i8 %x) nounwind {
> +  %t1 = atomicrmw umax i8* @sc8, i8 %x acquire
> +; X64:       cmpb
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgb
> +
> +; X32:       cmpb
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_umin8(i8 %x) nounwind {
> +  %t1 = atomicrmw umin i8* @sc8, i8 %x acquire
> +; X64:       cmpb
> +; X64:       cmov
> +; X64:       lock
> +; X64:       cmpxchgb
> +; X32:       cmpb
> +; X32:       cmov
> +; X32:       lock
> +; X32:       cmpxchgb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_cmpxchg8() nounwind {
> +  %t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire
> +; X64:       lock
> +; X64:       cmpxchgb
> +; X32:       lock
> +; X32:       cmpxchgb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_store8(i8 %x) nounwind {
> +  store atomic i8 %x, i8* @sc8 release, align 4
> +; X64-NOT:   lock
> +; X64:       movb
> +; X32-NOT:   lock
> +; X32:       movb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> +
> +define void @atomic_fetch_swap8(i8 %x) nounwind {
> +  %t1 = atomicrmw xchg i8* @sc8, i8 %x acquire
> +; X64-NOT:   lock
> +; X64:       xchgb
> +; X32-NOT:   lock
> +; X32:       xchgb
> +  ret void
> +; X64:       ret
> +; X32:       ret
> +}
> 
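
atomic8.ll is likewise XFAILed for both targets. One detail worth flagging in
the byte-sized min/max functions is the "cmov" check: cmovcc has no 8-bit
register form, so whatever the backend ends up emitting will have to do the
compare/select in a wider register (or with a branch) before the lock
cmpxchgb loop. A sketch of the case in question, with made-up names:

    @g8 = external global i8

    define i8 @fetch_max8(i8 %v) nounwind {
      ; cmov only exists for 16/32/64-bit operands, so the select here
      ; presumably happens in a widened register ahead of the cmpxchgb loop
      %old = atomicrmw max i8* @g8, i8 %v acquire
      ret i8 %old
    }
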
> Modified: llvm/trunk/test/CodeGen/X86/atomic_op.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/atomic_op.ll?rev=164281&r1=164280&r2=164281&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/atomic_op.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/atomic_op.ll Wed Sep 19 22:06:15 2012
> @@ -1,4 +1,4 @@
> -; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
> +; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+cmov | FileCheck %s
> 
> target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
> 
> @@ -107,13 +107,12 @@
>         ; CHECK: cmpxchgl
>   %17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic
> 	store i32 %17, i32* %old
> +        ; CHECK: movl  [[R17atomic:.*]], %eax
>         ; CHECK: movl	$1401, %[[R17mask:[a-z]*]]
> -        ; CHECK: movl	[[R17atomic:.*]], %eax
> -        ; CHECK: movl	%eax, %[[R17newval:[a-z]*]]
> -        ; CHECK: andl	%[[R17mask]], %[[R17newval]]
> -        ; CHECK: notl	%[[R17newval]]
> +        ; CHECK: andl	%eax, %[[R17mask]]
> +        ; CHECK: notl	%[[R17mask]]
>         ; CHECK: lock
> -        ; CHECK: cmpxchgl	%[[R17newval]], [[R17atomic]]
> +        ; CHECK: cmpxchgl	%[[R17mask]], [[R17atomic]]
>         ; CHECK: jne
>         ; CHECK: movl	%eax,
>   %18 = atomicrmw nand i32* %val2, i32 1401 monotonic
> 
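
Two separate things change in atomic_op.ll: the RUN line gains -mattr=+cmov,
presumably so the cmov-based expansions checked elsewhere in the file are
available on the plain generic 32-bit model, and the nand expectation loses
one register-to-register copy: the mask register is now and'ed with %eax and
inverted in place instead of going through a separate newval register.
Reduced to its shape, the updated expectation reads roughly as

    movl      (mem), %eax
    movl      $mask, %reg
    andl      %eax, %reg
    notl      %reg
    lock
    cmpxchgl  %reg, (mem)
    jne       <retry>

with (mem), %reg and $mask standing in for whatever the patterns bind.
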
> Added: llvm/trunk/test/CodeGen/X86/pr13458.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr13458.ll?rev=164281&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/pr13458.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/pr13458.ll Wed Sep 19 22:06:15 2012
> @@ -0,0 +1,14 @@
> +; RUN: llc < %s
> +
> +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
> +target triple = "x86_64-apple-darwin11.4.2"
> +
> +%v8_uniform_Stats.0.2.4.10 = type { i64, i64, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, i64, [7 x i32], [7 x i64] }
> +
> + at globalStats = external global %v8_uniform_Stats.0.2.4.10
> +
> +define void @MergeStats() nounwind {
> +allocas:
> +  %r.i.i720 = atomicrmw max i64* getelementptr inbounds (%v8_uniform_Stats.0.2.4.10* @globalStats, i64 0, i32 30), i64 0 seq_cst
> +  ret void
> +}
> 
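
Since the RUN line is just "llc < %s" with no FileCheck, pr13458.ll only
guards against a crash or assertion while selecting the seq_cst atomicrmw max
on the i64 field; any output is accepted. If a positive check is ever wanted,
something as small as the following (hypothetical, not part of the patch)
would at least pin down that the function survives codegen:

    ; RUN: llc < %s | FileCheck %s
    ; CHECK: MergeStats
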
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits



