[llvm] r244128 - x86 atomic: optimize a.store(reg op a.load(acquire), release)

Wed Aug 5 14:05:00 PDT 2015

Author: jfb
Date: Wed Aug  5 16:04:59 2015
New Revision: 244128

URL: http://llvm.org/viewvc/llvm-project?rev=244128&view=rev
Log:
x86 atomic: optimize a.store(reg op a.load(acquire), release)

Summary: PR24191 finds that the expected memory-register operations aren't generated when a relaxed { load ; modify ; store } sequence is used. This is similar to PR17281, which was addressed in D4796, but only for memory-immediate operations (and for memory orderings up to acquire and release). This patch also handles some floating-point operations.

Reviewers: reames, kcc, dvyukov, nadav, morisset, chandlerc, t.p.northover, pete

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D11382

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.h
    llvm/trunk/lib/Target/X86/X86InstrCompiler.td
    llvm/trunk/lib/Target/X86/X86MCInstLower.cpp
    llvm/trunk/test/CodeGen/X86/atomic_mi.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=244128&r1=244127&r2=244128&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Aug  5 16:04:59 2015
@@ -20133,6 +20133,45 @@ X86TargetLowering::EmitLoweredSelect(Mac
 }
 
 MachineBasicBlock *
+X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
+                                       MachineBasicBlock *BB) const {
+  // Expand a RELEASE_FADD{32,64}mr pseudo, which stands for
+  //   a.store(reg OP a.load(acquire), release)
+  // into a load-folding FP op followed by a plain store to the same address:
+  //   OPss <mem>, %xmm
+  //   movss %xmm, <mem>
+  // (or the sd equivalents for 64-bit operations). Returns BB unchanged.
+  unsigned MOp, FOp;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
+  case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break;
+  case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break;
+  }
+  const X86InstrInfo *TII = Subtarget->getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  // Pseudo operands: the address operands ($dst) come first, then $src.
+  unsigned VSrc = MI->getOperand(X86::AddrNumOperands).getReg();
+  unsigned VRes = MRI.createVirtualRegister(MRI.getRegClass(VSrc));
+  // Both are inserted before MI, so the FP op precedes the store.
+  MachineInstrBuilder MIO = BuildMI(*BB, MI, DL, TII->get(FOp), VRes)
+                                .addReg(VSrc);
+  MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp));
+  // Forward the pseudo's full addressing mode instead of assuming a bare base
+  // register: the base may be a frame index and the disp may be non-zero.
+  for (unsigned I = 0; I != X86::AddrNumOperands; ++I) {
+    MachineOperand Addr = MI->getOperand(I);
+    if (Addr.isReg())
+      Addr.setIsKill(false); // Used twice; a kill on the first use is wrong.
+    MIO.addOperand(Addr);
+    MIM.addOperand(Addr);
+  }
+  MIM.addReg(VRes, RegState::Kill);
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
                                         MachineBasicBlock *BB) const {
   MachineFunction *MF = BB->getParent();
@@ -20687,6 +20726,10 @@ X86TargetLowering::EmitInstrWithCustomIn
   case X86::CMOV_V64I1:
     return EmitLoweredSelect(MI, BB);
 
+  case X86::RELEASE_FADD32mr:
+  case X86::RELEASE_FADD64mr:
+    return EmitLoweredAtomicFP(MI, BB);
+
   case X86::FP32_TO_INT16_IN_MEM:
   case X86::FP32_TO_INT32_IN_MEM:
   case X86::FP32_TO_INT64_IN_MEM:

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=244128&r1=244127&r2=244128&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Wed Aug  5 16:04:59 2015
@@ -1080,6 +1080,9 @@ namespace llvm {
     MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
                                          MachineBasicBlock *BB) const;
 
+    MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I,
+                                           MachineBasicBlock *BB) const;
+
     MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
                                               MachineBasicBlock *BB) const;
 

Modified: llvm/trunk/lib/Target/X86/X86InstrCompiler.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrCompiler.td?rev=244128&r1=244127&r2=244128&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrCompiler.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrCompiler.td Wed Aug  5 16:04:59 2015
@@ -752,6 +752,8 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc
 
 /* The following multiclass tries to make sure that in code like
  *    x.store (immediate op x.load(acquire), release)
+ * and
+ *    x.store (register op x.load(acquire), release)
  * an operation directly on memory is generated instead of wasting a register.
  * It is not automatic as atomic_store/load are only lowered to MOV instructions
  * extremely late to prevent them from being accidentally reordered in the backend
@@ -759,19 +761,31 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc
  */
 multiclass RELEASE_BINOP_MI<string op> {
     def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
-        "#RELEASE_BINOP PSEUDO!",
+        "#BINOP "#NAME#"8mi PSEUDO!",
         [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op)
             (atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
+    def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src),
+        "#BINOP "#NAME#"8mr PSEUDO!",
+        [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op)
+            (atomic_load_8 addr:$dst), GR8:$src))]>;
     // NAME#16 is not generated as 16-bit arithmetic instructions are considered
     // costly and avoided as far as possible by this backend anyway
     def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
-        "#RELEASE_BINOP PSEUDO!",
+        "#BINOP "#NAME#"32mi PSEUDO!",
         [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op)
             (atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
+    def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+        "#BINOP "#NAME#"32mr PSEUDO!",
+        [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op)
+            (atomic_load_32 addr:$dst), GR32:$src))]>;
     def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
-        "#RELEASE_BINOP PSEUDO!",
+        "#BINOP "#NAME#"64mi32 PSEUDO!",
         [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op)
             (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
+    def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+        "#BINOP "#NAME#"64mr PSEUDO!",
+        [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op)
+            (atomic_load_64 addr:$dst), GR64:$src))]>;
 }
 defm RELEASE_ADD : RELEASE_BINOP_MI<"add">;
 defm RELEASE_AND : RELEASE_BINOP_MI<"and">;
@@ -780,18 +794,41 @@ defm RELEASE_XOR : RELEASE_BINOP_MI<"xor
 // Note: we don't deal with sub, because substractions of constants are
 // optimized into additions before this code can run
 
+// Same as RELEASE_BINOP_MI above, but for floating-point register operands.
+// FIXME: Add an immediate-operand version.
+// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
+// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
+let usesCustomInserter = 1 in { // Expanded by EmitLoweredAtomicFP.
+multiclass RELEASE_FP_BINOP_MI<string op> {
+    def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
+        "#BINOP "#NAME#"32mr PSEUDO!",
+        [(atomic_store_32 addr:$dst,
+	   (i32 (bitconvert (!cast<PatFrag>(op)
+             (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
+	      FR32:$src))))]>, Requires<[HasSSE1]>;
+    def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
+        "#BINOP "#NAME#"64mr PSEUDO!",
+        [(atomic_store_64 addr:$dst,
+	   (i64 (bitconvert (!cast<PatFrag>(op)
+             (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
+	      FR64:$src))))]>, Requires<[HasSSE2]>;
+}
+defm RELEASE_FADD : RELEASE_FP_BINOP_MI<"fadd">;
+// FIXME: Add fsub, fmul, fdiv, ...
+}
+
 multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
     def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
-        "#RELEASE_UNOP PSEUDO!",
+        "#UNOP "#NAME#"8m PSEUDO!",
         [(atomic_store_8 addr:$dst, dag8)]>;
     def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
-        "#RELEASE_UNOP PSEUDO!",
+        "#UNOP "#NAME#"16m PSEUDO!",
         [(atomic_store_16 addr:$dst, dag16)]>;
     def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
-        "#RELEASE_UNOP PSEUDO!",
+        "#UNOP "#NAME#"32m PSEUDO!",
         [(atomic_store_32 addr:$dst, dag32)]>;
     def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
-        "#RELEASE_UNOP PSEUDO!",
+        "#UNOP "#NAME#"64m PSEUDO!",
         [(atomic_store_64 addr:$dst, dag64)]>;
 }
 
@@ -821,42 +858,42 @@ defm RELEASE_NOT : RELEASE_UNOP<
 */
 
 def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
-			"#RELEASE_MOV PSEUDO !",
+			"#RELEASE_MOV8mi PSEUDO!",
 			[(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
 def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
-			"#RELEASE_MOV PSEUDO !",
+			"#RELEASE_MOV16mi PSEUDO!",
 			[(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
 def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
-			"#RELEASE_MOV PSEUDO !",
+			"#RELEASE_MOV32mi PSEUDO!",
 			[(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
 def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
-			"#RELEASE_MOV PSEUDO !",
+			"#RELEASE_MOV64mi32 PSEUDO!",
 			[(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
 
 def RELEASE_MOV8mr  : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
-                        "#RELEASE_MOV PSEUDO!",
+                        "#RELEASE_MOV8mr PSEUDO!",
                         [(atomic_store_8  addr:$dst, GR8 :$src)]>;
 def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
-                        "#RELEASE_MOV PSEUDO!",
+                        "#RELEASE_MOV16mr PSEUDO!",
                         [(atomic_store_16 addr:$dst, GR16:$src)]>;
 def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
-                        "#RELEASE_MOV PSEUDO!",
+                        "#RELEASE_MOV32mr PSEUDO!",
                         [(atomic_store_32 addr:$dst, GR32:$src)]>;
 def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
-                        "#RELEASE_MOV PSEUDO!",
+                        "#RELEASE_MOV64mr PSEUDO!",
                         [(atomic_store_64 addr:$dst, GR64:$src)]>;
 
 def ACQUIRE_MOV8rm  : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
-                      "#ACQUIRE_MOV PSEUDO!",
+                      "#ACQUIRE_MOV8rm PSEUDO!",
                       [(set GR8:$dst,  (atomic_load_8  addr:$src))]>;
 def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
-                      "#ACQUIRE_MOV PSEUDO!",
+                      "#ACQUIRE_MOV16rm PSEUDO!",
                       [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
 def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
-                      "#ACQUIRE_MOV PSEUDO!",
+                      "#ACQUIRE_MOV32rm PSEUDO!",
                       [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
 def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
-                      "#ACQUIRE_MOV PSEUDO!",
+                      "#ACQUIRE_MOV64rm PSEUDO!",
                       [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
 
 //===----------------------------------------------------------------------===//

Modified: llvm/trunk/lib/Target/X86/X86MCInstLower.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86MCInstLower.cpp?rev=244128&r1=244127&r2=244128&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86MCInstLower.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86MCInstLower.cpp Wed Aug  5 16:04:59 2015
@@ -598,17 +598,29 @@ ReSimplify:
   case X86::RELEASE_MOV32mi:   OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
   case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
   case X86::RELEASE_ADD8mi:    OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
+  case X86::RELEASE_ADD8mr:    OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
   case X86::RELEASE_ADD32mi:   OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
+  case X86::RELEASE_ADD32mr:   OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
   case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
+  case X86::RELEASE_ADD64mr:   OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
   case X86::RELEASE_AND8mi:    OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
+  case X86::RELEASE_AND8mr:    OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
   case X86::RELEASE_AND32mi:   OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
+  case X86::RELEASE_AND32mr:   OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
   case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
+  case X86::RELEASE_AND64mr:   OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
   case X86::RELEASE_OR8mi:     OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
+  case X86::RELEASE_OR8mr:     OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
   case X86::RELEASE_OR32mi:    OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
+  case X86::RELEASE_OR32mr:    OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
   case X86::RELEASE_OR64mi32:  OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
+  case X86::RELEASE_OR64mr:    OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
   case X86::RELEASE_XOR8mi:    OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
+  case X86::RELEASE_XOR8mr:    OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
   case X86::RELEASE_XOR32mi:   OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
+  case X86::RELEASE_XOR32mr:   OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
   case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
+  case X86::RELEASE_XOR64mr:   OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
   case X86::RELEASE_INC8m:     OutMI.setOpcode(X86::INC8m); goto ReSimplify;
   case X86::RELEASE_INC16m:    OutMI.setOpcode(X86::INC16m); goto ReSimplify;
   case X86::RELEASE_INC32m:    OutMI.setOpcode(X86::INC32m); goto ReSimplify;

Modified: llvm/trunk/test/CodeGen/X86/atomic_mi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/atomic_mi.ll?rev=244128&r1=244127&r2=244128&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/atomic_mi.ll (original)
+++ llvm/trunk/test/CodeGen/X86/atomic_mi.ll Wed Aug  5 16:04:59 2015
@@ -14,7 +14,11 @@
 ; The binary operations supported are currently add, and, or, xor.
 ; sub is not supported because they are translated by an addition of the
 ; negated immediate.
-; Finally, we also check the same kind of pattern for inc/dec
+;
+; We also check the same patterns:
+; - For inc/dec.
+; - For register instead of immediate operands.
+; - For floating point operations.
 
 ; seq_cst stores are left as (lock) xchgl, but we try to check every other
 ; attribute at least once.
@@ -25,10 +29,10 @@
 ; an implicit lock prefix, so making it explicit is not required.
 
 define void @store_atomic_imm_8(i8* %p) {
-; X64-LABEL: store_atomic_imm_8
+; X64-LABEL: store_atomic_imm_8:
 ; X64: movb
 ; X64-NOT: movb
-; X32-LABEL: store_atomic_imm_8
+; X32-LABEL: store_atomic_imm_8:
 ; X32: movb
 ; X32-NOT: movb
   store atomic i8 42, i8* %p release, align 1
@@ -36,10 +40,10 @@ define void @store_atomic_imm_8(i8* %p)
 }
 
 define void @store_atomic_imm_16(i16* %p) {
-; X64-LABEL: store_atomic_imm_16
+; X64-LABEL: store_atomic_imm_16:
 ; X64: movw
 ; X64-NOT: movw
-; X32-LABEL: store_atomic_imm_16
+; X32-LABEL: store_atomic_imm_16:
 ; X32: movw
 ; X32-NOT: movw
   store atomic i16 42, i16* %p monotonic, align 2
@@ -47,12 +51,12 @@ define void @store_atomic_imm_16(i16* %p
 }
 
 define void @store_atomic_imm_32(i32* %p) {
-; X64-LABEL: store_atomic_imm_32
+; X64-LABEL: store_atomic_imm_32:
 ; X64: movl
 ; X64-NOT: movl
 ;   On 32 bits, there is an extra movl for each of those functions
 ;   (probably for alignment reasons).
-; X32-LABEL: store_atomic_imm_32
+; X32-LABEL: store_atomic_imm_32:
 ; X32: movl 4(%esp), %eax
 ; X32: movl
 ; X32-NOT: movl
@@ -61,12 +65,12 @@ define void @store_atomic_imm_32(i32* %p
 }
 
 define void @store_atomic_imm_64(i64* %p) {
-; X64-LABEL: store_atomic_imm_64
+; X64-LABEL: store_atomic_imm_64:
 ; X64: movq
 ; X64-NOT: movq
 ;   These are implemented with a CAS loop on 32 bit architectures, and thus
 ;   cannot be optimized in the same way as the others.
-; X32-LABEL: store_atomic_imm_64
+; X32-LABEL: store_atomic_imm_64:
 ; X32: cmpxchg8b
   store atomic i64 42, i64* %p release, align 8
   ret void
@@ -75,7 +79,7 @@ define void @store_atomic_imm_64(i64* %p
 ; If an immediate is too big to fit in 32 bits, it cannot be store in one mov,
 ; even on X64, one must use movabsq that can only target a register.
 define void @store_atomic_imm_64_big(i64* %p) {
-; X64-LABEL: store_atomic_imm_64_big
+; X64-LABEL: store_atomic_imm_64_big:
 ; X64: movabsq
 ; X64: movq
   store atomic i64 100000000000, i64* %p monotonic, align 8
@@ -84,9 +88,9 @@ define void @store_atomic_imm_64_big(i64
 
 ; It would be incorrect to replace a lock xchgl by a movl
 define void @store_atomic_imm_32_seq_cst(i32* %p) {
-; X64-LABEL: store_atomic_imm_32_seq_cst
+; X64-LABEL: store_atomic_imm_32_seq_cst:
 ; X64: xchgl
-; X32-LABEL: store_atomic_imm_32_seq_cst
+; X32-LABEL: store_atomic_imm_32_seq_cst:
 ; X32: xchgl
   store atomic i32 42, i32* %p seq_cst, align 4
   ret void
@@ -94,12 +98,12 @@ define void @store_atomic_imm_32_seq_cst
 
 ; ----- ADD -----
 
-define void @add_8(i8* %p) {
-; X64-LABEL: add_8
+define void @add_8i(i8* %p) {
+; X64-LABEL: add_8i:
 ; X64-NOT: lock
 ; X64: addb
 ; X64-NOT: movb
-; X32-LABEL: add_8
+; X32-LABEL: add_8i:
 ; X32-NOT: lock
 ; X32: addb
 ; X32-NOT: movb
@@ -109,12 +113,27 @@ define void @add_8(i8* %p) {
   ret void
 }
 
-define void @add_16(i16* %p) {
+define void @add_8r(i8* %p, i8 %v) {
+; X64-LABEL: add_8r:
+; X64-NOT: lock
+; X64: addb
+; X64-NOT: movb
+; X32-LABEL: add_8r:
+; X32-NOT: lock
+; X32: addb
+; X32-NOT: movb
+  %1 = load atomic i8, i8* %p seq_cst, align 1
+  %2 = add i8 %1, %v
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @add_16i(i16* %p) {
 ;   Currently the transformation is not done on 16 bit accesses, as the backend
 ;   treat 16 bit arithmetic as expensive on X86/X86_64.
-; X64-LABEL: add_16
+; X64-LABEL: add_16i:
 ; X64-NOT: addw
-; X32-LABEL: add_16
+; X32-LABEL: add_16i:
 ; X32-NOT: addw
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = add i16 %1, 2
@@ -122,12 +141,25 @@ define void @add_16(i16* %p) {
   ret void
 }
 
-define void @add_32(i32* %p) {
-; X64-LABEL: add_32
+define void @add_16r(i16* %p, i16 %v) {
+;   Currently the transformation is not done on 16-bit accesses, as the backend
+;   treats 16-bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: add_16r:
+; X64-NOT: addw
+; X32-LABEL: add_16r:
+; X32-NOT: addw {{.*}}, (
+  %1 = load atomic i16, i16* %p acquire, align 2
+  %2 = add i16 %1, %v
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @add_32i(i32* %p) {
+; X64-LABEL: add_32i:
 ; X64-NOT: lock
 ; X64: addl
 ; X64-NOT: movl
-; X32-LABEL: add_32
+; X32-LABEL: add_32i:
 ; X32-NOT: lock
 ; X32: addl
 ; X32-NOT: movl
@@ -137,23 +169,94 @@ define void @add_32(i32* %p) {
   ret void
 }
 
-define void @add_64(i64* %p) {
-; X64-LABEL: add_64
+define void @add_32r(i32* %p, i32 %v) {
+; X64-LABEL: add_32r:
+; X64-NOT: lock
+; X64: addl
+; X64-NOT: movl
+; X32-LABEL: add_32r:
+; X32-NOT: lock
+; X32: addl
+; X32-NOT: movl
+  %1 = load atomic i32, i32* %p acquire, align 4
+  %2 = add i32 %1, %v
+  store atomic i32 %2, i32* %p monotonic, align 4
+  ret void
+}
+
+; The following is a corner case where the load is added to itself. The pattern
+; matching should not fold this. We only test with 32-bit add, but the same
+; applies to other sizes and operations.
+define void @add_32r_self(i32* %p) {
+; X64-LABEL: add_32r_self:
+; X64-NOT: lock
+; X64: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
+; X64: addl %[[R]], %[[R]]
+; X64: movl %[[R]], (%[[M]])
+; X32-LABEL: add_32r_self:
+; X32-NOT: lock
+; X32: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
+; X32: addl %[[R]], %[[R]]
+; X32: movl %[[R]], (%[[M]])
+  %1 = load atomic i32, i32* %p acquire, align 4
+  %2 = add i32 %1, %1
+  store atomic i32 %2, i32* %p monotonic, align 4
+  ret void
+}
+
+; The following is a corner case where the load's result is returned. The
+; optimizer isn't allowed to duplicate the load because it's atomic.
+define i32 @add_32r_ret_load(i32* %p, i32 %v) {
+; X64-LABEL: add_32r_ret_load:
+; X64-NOT: lock
+; X64:      movl (%rdi), %eax
+; X64-NEXT: leal (%rsi,%rax), %ecx
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: retq
+; X32-LABEL: add_32r_ret_load:
+; X32-NOT: lock
+; X32:      movl 4(%esp), %[[P:[a-z]+]]
+; X32-NEXT: movl (%[[P]]),
+; X32-NOT: %[[P]]
+; More code here, we just don't want it to load from P.
+; X32: movl %{{.*}}, (%[[P]])
+; X32-NEXT: retl
+  %1 = load atomic i32, i32* %p acquire, align 4
+  %2 = add i32 %1, %v
+  store atomic i32 %2, i32* %p monotonic, align 4
+  ret i32 %1
+}
+
+define void @add_64i(i64* %p) {
+; X64-LABEL: add_64i:
 ; X64-NOT: lock
 ; X64: addq
 ; X64-NOT: movq
 ;   We do not check X86-32 as it cannot do 'addq'.
-; X32-LABEL: add_64
+; X32-LABEL: add_64i:
   %1 = load atomic i64, i64* %p acquire, align 8
   %2 = add i64 %1, 2
   store atomic i64 %2, i64* %p release, align 8
   ret void
 }
 
-define void @add_32_seq_cst(i32* %p) {
-; X64-LABEL: add_32_seq_cst
+define void @add_64r(i64* %p, i64 %v) {
+; X64-LABEL: add_64r:
+; X64-NOT: lock
+; X64: addq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'addq'.
+; X32-LABEL: add_64r:
+  %1 = load atomic i64, i64* %p acquire, align 8
+  %2 = add i64 %1, %v
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @add_32i_seq_cst(i32* %p) {
+; X64-LABEL: add_32i_seq_cst:
 ; X64: xchgl
-; X32-LABEL: add_32_seq_cst
+; X32-LABEL: add_32i_seq_cst:
 ; X32: xchgl
   %1 = load atomic i32, i32* %p monotonic, align 4
   %2 = add i32 %1, 2
@@ -161,14 +264,25 @@ define void @add_32_seq_cst(i32* %p) {
   ret void
 }
 
+define void @add_32r_seq_cst(i32* %p, i32 %v) {
+; X64-LABEL: add_32r_seq_cst:
+; X64: xchgl
+; X32-LABEL: add_32r_seq_cst:
+; X32: xchgl
+  %1 = load atomic i32, i32* %p monotonic, align 4
+  %2 = add i32 %1, %v
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
 ; ----- AND -----
 
-define void @and_8(i8* %p) {
-; X64-LABEL: and_8
+define void @and_8i(i8* %p) {
+; X64-LABEL: and_8i:
 ; X64-NOT: lock
 ; X64: andb
 ; X64-NOT: movb
-; X32-LABEL: and_8
+; X32-LABEL: and_8i:
 ; X32-NOT: lock
 ; X32: andb
 ; X32-NOT: movb
@@ -178,12 +292,27 @@ define void @and_8(i8* %p) {
   ret void
 }
 
-define void @and_16(i16* %p) {
+define void @and_8r(i8* %p, i8 %v) {
+; X64-LABEL: and_8r:
+; X64-NOT: lock
+; X64: andb
+; X64-NOT: movb
+; X32-LABEL: and_8r:
+; X32-NOT: lock
+; X32: andb
+; X32-NOT: movb
+  %1 = load atomic i8, i8* %p monotonic, align 1
+  %2 = and i8 %1, %v
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @and_16i(i16* %p) {
 ;   Currently the transformation is not done on 16 bit accesses, as the backend
 ;   treat 16 bit arithmetic as expensive on X86/X86_64.
-; X64-LABEL: and_16
+; X64-LABEL: and_16i:
 ; X64-NOT: andw
-; X32-LABEL: and_16
+; X32-LABEL: and_16i:
 ; X32-NOT: andw
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = and i16 %1, 2
@@ -191,12 +320,25 @@ define void @and_16(i16* %p) {
   ret void
 }
 
-define void @and_32(i32* %p) {
-; X64-LABEL: and_32
+define void @and_16r(i16* %p, i16 %v) {
+;   Currently the transformation is not done on 16-bit accesses, as the backend
+;   treats 16-bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: and_16r:
+; X64-NOT: andw
+; X32-LABEL: and_16r:
+; X32-NOT: andw {{.*}}, (
+  %1 = load atomic i16, i16* %p acquire, align 2
+  %2 = and i16 %1, %v
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @and_32i(i32* %p) {
+; X64-LABEL: and_32i:
 ; X64-NOT: lock
 ; X64: andl
 ; X64-NOT: movl
-; X32-LABEL: and_32
+; X32-LABEL: and_32i:
 ; X32-NOT: lock
 ; X32: andl
 ; X32-NOT: movl
@@ -206,23 +348,51 @@ define void @and_32(i32* %p) {
   ret void
 }
 
-define void @and_64(i64* %p) {
-; X64-LABEL: and_64
+define void @and_32r(i32* %p, i32 %v) {
+; X64-LABEL: and_32r:
+; X64-NOT: lock
+; X64: andl
+; X64-NOT: movl
+; X32-LABEL: and_32r:
+; X32-NOT: lock
+; X32: andl
+; X32-NOT: movl
+  %1 = load atomic i32, i32* %p acquire, align 4
+  %2 = and i32 %1, %v
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @and_64i(i64* %p) {
+; X64-LABEL: and_64i:
 ; X64-NOT: lock
 ; X64: andq
 ; X64-NOT: movq
 ;   We do not check X86-32 as it cannot do 'andq'.
-; X32-LABEL: and_64
+; X32-LABEL: and_64i:
   %1 = load atomic i64, i64* %p acquire, align 8
   %2 = and i64 %1, 2
   store atomic i64 %2, i64* %p release, align 8
   ret void
 }
 
-define void @and_32_seq_cst(i32* %p) {
-; X64-LABEL: and_32_seq_cst
+define void @and_64r(i64* %p, i64 %v) {
+; X64-LABEL: and_64r:
+; X64-NOT: lock
+; X64: andq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'andq'.
+; X32-LABEL: and_64r:
+  %1 = load atomic i64, i64* %p acquire, align 8
+  %2 = and i64 %1, %v
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @and_32i_seq_cst(i32* %p) {
+; X64-LABEL: and_32i_seq_cst:
 ; X64: xchgl
-; X32-LABEL: and_32_seq_cst
+; X32-LABEL: and_32i_seq_cst:
 ; X32: xchgl
   %1 = load atomic i32, i32* %p monotonic, align 4
   %2 = and i32 %1, 2
@@ -230,14 +400,25 @@ define void @and_32_seq_cst(i32* %p) {
   ret void
 }
 
+define void @and_32r_seq_cst(i32* %p, i32 %v) {
+; X64-LABEL: and_32r_seq_cst:
+; X64: xchgl
+; X32-LABEL: and_32r_seq_cst:
+; X32: xchgl
+  %1 = load atomic i32, i32* %p monotonic, align 4
+  %2 = and i32 %1, %v
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
 ; ----- OR -----
 
-define void @or_8(i8* %p) {
-; X64-LABEL: or_8
+define void @or_8i(i8* %p) {
+; X64-LABEL: or_8i:
 ; X64-NOT: lock
 ; X64: orb
 ; X64-NOT: movb
-; X32-LABEL: or_8
+; X32-LABEL: or_8i:
 ; X32-NOT: lock
 ; X32: orb
 ; X32-NOT: movb
@@ -247,10 +428,25 @@ define void @or_8(i8* %p) {
   ret void
 }
 
-define void @or_16(i16* %p) {
-; X64-LABEL: or_16
+define void @or_8r(i8* %p, i8 %v) {
+; X64-LABEL: or_8r:
+; X64-NOT: lock
+; X64: orb
+; X64-NOT: movb
+; X32-LABEL: or_8r:
+; X32-NOT: lock
+; X32: orb
+; X32-NOT: movb
+  %1 = load atomic i8, i8* %p acquire, align 1
+  %2 = or i8 %1, %v
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @or_16i(i16* %p) {
+; X64-LABEL: or_16i:
 ; X64-NOT: orw
-; X32-LABEL: or_16
+; X32-LABEL: or_16i:
 ; X32-NOT: orw
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = or i16 %1, 2
@@ -258,12 +454,23 @@ define void @or_16(i16* %p) {
   ret void
 }
 
-define void @or_32(i32* %p) {
-; X64-LABEL: or_32
+define void @or_16r(i16* %p, i16 %v) {
+; X64-LABEL: or_16r:
+; X64-NOT: orw
+; X32-LABEL: or_16r:
+; X32-NOT: orw {{.*}}, (
+  %1 = load atomic i16, i16* %p acquire, align 2
+  %2 = or i16 %1, %v
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @or_32i(i32* %p) {
+; X64-LABEL: or_32i:
 ; X64-NOT: lock
 ; X64: orl
 ; X64-NOT: movl
-; X32-LABEL: or_32
+; X32-LABEL: or_32i:
 ; X32-NOT: lock
 ; X32: orl
 ; X32-NOT: movl
@@ -273,23 +480,51 @@ define void @or_32(i32* %p) {
   ret void
 }
 
-define void @or_64(i64* %p) {
-; X64-LABEL: or_64
+define void @or_32r(i32* %p, i32 %v) {
+; X64-LABEL: or_32r:
+; X64-NOT: lock
+; X64: orl
+; X64-NOT: movl
+; X32-LABEL: or_32r:
+; X32-NOT: lock
+; X32: orl
+; X32-NOT: movl
+  %1 = load atomic i32, i32* %p acquire, align 4
+  %2 = or i32 %1, %v
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @or_64i(i64* %p) {
+; X64-LABEL: or_64i:
 ; X64-NOT: lock
 ; X64: orq
 ; X64-NOT: movq
 ;   We do not check X86-32 as it cannot do 'orq'.
-; X32-LABEL: or_64
+; X32-LABEL: or_64i:
   %1 = load atomic i64, i64* %p acquire, align 8
   %2 = or i64 %1, 2
   store atomic i64 %2, i64* %p release, align 8
   ret void
 }
 
-define void @or_32_seq_cst(i32* %p) {
-; X64-LABEL: or_32_seq_cst
+define void @or_64r(i64* %p, i64 %v) {
+; X64-LABEL: or_64r:
+; X64-NOT: lock
+; X64: orq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'orq'.
+; X32-LABEL: or_64r:
+  %1 = load atomic i64, i64* %p acquire, align 8
+  %2 = or i64 %1, %v
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @or_32i_seq_cst(i32* %p) {
+; X64-LABEL: or_32i_seq_cst:
 ; X64: xchgl
-; X32-LABEL: or_32_seq_cst
+; X32-LABEL: or_32i_seq_cst:
 ; X32: xchgl
   %1 = load atomic i32, i32* %p monotonic, align 4
   %2 = or i32 %1, 2
@@ -297,14 +532,25 @@ define void @or_32_seq_cst(i32* %p) {
   ret void
 }
 
+define void @or_32r_seq_cst(i32* %p, i32 %v) {
+; X64-LABEL: or_32r_seq_cst:
+; X64: xchgl
+; X32-LABEL: or_32r_seq_cst:
+; X32: xchgl
+  %1 = load atomic i32, i32* %p monotonic, align 4
+  %2 = or i32 %1, %v
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
 ; ----- XOR -----
 
-define void @xor_8(i8* %p) {
-; X64-LABEL: xor_8
+define void @xor_8i(i8* %p) {
+; X64-LABEL: xor_8i:
 ; X64-NOT: lock
 ; X64: xorb
 ; X64-NOT: movb
-; X32-LABEL: xor_8
+; X32-LABEL: xor_8i:
 ; X32-NOT: lock
 ; X32: xorb
 ; X32-NOT: movb
@@ -314,10 +560,25 @@ define void @xor_8(i8* %p) {
   ret void
 }
 
-define void @xor_16(i16* %p) {
-; X64-LABEL: xor_16
+define void @xor_8r(i8* %p, i8 %v) {
+; X64-LABEL: xor_8r:
+; X64-NOT: lock
+; X64: xorb
+; X64-NOT: movb
+; X32-LABEL: xor_8r:
+; X32-NOT: lock
+; X32: xorb
+; X32-NOT: movb
+  %1 = load atomic i8, i8* %p acquire, align 1
+  %2 = xor i8 %1, %v
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @xor_16i(i16* %p) {
+; X64-LABEL: xor_16i:
 ; X64-NOT: xorw
-; X32-LABEL: xor_16
+; X32-LABEL: xor_16i:
 ; X32-NOT: xorw
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = xor i16 %1, 2
@@ -325,12 +586,23 @@ define void @xor_16(i16* %p) {
   ret void
 }
 
-define void @xor_32(i32* %p) {
-; X64-LABEL: xor_32
+define void @xor_16r(i16* %p, i16 %v) {
+; X64-LABEL: xor_16r:
+; X64-NOT: xorw
+; X32-LABEL: xor_16r:
+; X32-NOT: xorw {{.*}}, (
+  %1 = load atomic i16, i16* %p acquire, align 2
+  %2 = xor i16 %1, %v
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @xor_32i(i32* %p) {
+; X64-LABEL: xor_32i:
 ; X64-NOT: lock
 ; X64: xorl
 ; X64-NOT: movl
-; X32-LABEL: xor_32
+; X32-LABEL: xor_32i:
 ; X32-NOT: lock
 ; X32: xorl
 ; X32-NOT: movl
@@ -340,23 +612,51 @@ define void @xor_32(i32* %p) {
   ret void
 }
 
-define void @xor_64(i64* %p) {
-; X64-LABEL: xor_64
+define void @xor_32r(i32* %p, i32 %v) {
+; X64-LABEL: xor_32r:
+; X64-NOT: lock
+; X64: xorl
+; X64-NOT: movl
+; X32-LABEL: xor_32r:
+; X32-NOT: lock
+; X32: xorl
+; X32-NOT: movl
+  %1 = load atomic i32, i32* %p acquire, align 4
+  %2 = xor i32 %1, %v
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @xor_64i(i64* %p) {
+; X64-LABEL: xor_64i:
 ; X64-NOT: lock
 ; X64: xorq
 ; X64-NOT: movq
 ;   We do not check X86-32 as it cannot do 'xorq'.
-; X32-LABEL: xor_64
+; X32-LABEL: xor_64i:
   %1 = load atomic i64, i64* %p acquire, align 8
   %2 = xor i64 %1, 2
   store atomic i64 %2, i64* %p release, align 8
   ret void
 }
 
-define void @xor_32_seq_cst(i32* %p) {
-; X64-LABEL: xor_32_seq_cst
+define void @xor_64r(i64* %p, i64 %v) {
+; X64-LABEL: xor_64r:
+; X64-NOT: lock
+; X64: xorq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'xorq'.
+; X32-LABEL: xor_64r:
+  %1 = load atomic i64, i64* %p acquire, align 8
+  %2 = xor i64 %1, %v
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @xor_32i_seq_cst(i32* %p) {
+; X64-LABEL: xor_32i_seq_cst:
 ; X64: xchgl
-; X32-LABEL: xor_32_seq_cst
+; X32-LABEL: xor_32i_seq_cst:
 ; X32: xchgl
   %1 = load atomic i32, i32* %p monotonic, align 4
   %2 = xor i32 %1, 2
@@ -364,18 +664,29 @@ define void @xor_32_seq_cst(i32* %p) {
   ret void
 }
 
+define void @xor_32r_seq_cst(i32* %p, i32 %v) {
+; X64-LABEL: xor_32r_seq_cst:
+; X64: xchgl
+; X32-LABEL: xor_32r_seq_cst:
+; X32: xchgl
+  %1 = load atomic i32, i32* %p monotonic, align 4
+  %2 = xor i32 %1, %v
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
 ; ----- INC -----
 
 define void @inc_8(i8* %p) {
-; X64-LABEL: inc_8
+; X64-LABEL: inc_8:
 ; X64-NOT: lock
 ; X64: incb
 ; X64-NOT: movb
-; X32-LABEL: inc_8
+; X32-LABEL: inc_8:
 ; X32-NOT: lock
 ; X32: incb
 ; X32-NOT: movb
-; SLOW_INC-LABEL: inc_8
+; SLOW_INC-LABEL: inc_8:
 ; SLOW_INC-NOT: incb
 ; SLOW_INC-NOT: movb
   %1 = load atomic i8, i8* %p seq_cst, align 1
@@ -387,11 +698,11 @@ define void @inc_8(i8* %p) {
 define void @inc_16(i16* %p) {
 ;   Currently the transformation is not done on 16 bit accesses, as the backend
 ;   treats 16 bit arithmetic as expensive on X86/X86_64.
-; X64-LABEL: inc_16
+; X64-LABEL: inc_16:
 ; X64-NOT: incw
-; X32-LABEL: inc_16
+; X32-LABEL: inc_16:
 ; X32-NOT: incw
-; SLOW_INC-LABEL: inc_16
+; SLOW_INC-LABEL: inc_16:
 ; SLOW_INC-NOT: incw
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = add i16 %1, 1
@@ -400,15 +711,15 @@ define void @inc_16(i16* %p) {
 }
 
 define void @inc_32(i32* %p) {
-; X64-LABEL: inc_32
+; X64-LABEL: inc_32:
 ; X64-NOT: lock
 ; X64: incl
 ; X64-NOT: movl
-; X32-LABEL: inc_32
+; X32-LABEL: inc_32:
 ; X32-NOT: lock
 ; X32: incl
 ; X32-NOT: movl
-; SLOW_INC-LABEL: inc_32
+; SLOW_INC-LABEL: inc_32:
 ; SLOW_INC-NOT: incl
 ; SLOW_INC-NOT: movl
   %1 = load atomic i32, i32* %p acquire, align 4
@@ -418,13 +729,13 @@ define void @inc_32(i32* %p) {
 }
 
 define void @inc_64(i64* %p) {
-; X64-LABEL: inc_64
+; X64-LABEL: inc_64:
 ; X64-NOT: lock
 ; X64: incq
 ; X64-NOT: movq
 ;   We do not check X86-32 as it cannot do 'incq'.
-; X32-LABEL: inc_64
-; SLOW_INC-LABEL: inc_64
+; X32-LABEL: inc_64:
+; SLOW_INC-LABEL: inc_64:
 ; SLOW_INC-NOT: incq
 ; SLOW_INC-NOT: movq
   %1 = load atomic i64, i64* %p acquire, align 8
@@ -434,9 +745,9 @@ define void @inc_64(i64* %p) {
 }
 
 define void @inc_32_seq_cst(i32* %p) {
-; X64-LABEL: inc_32_seq_cst
+; X64-LABEL: inc_32_seq_cst:
 ; X64: xchgl
-; X32-LABEL: inc_32_seq_cst
+; X32-LABEL: inc_32_seq_cst:
 ; X32: xchgl
   %1 = load atomic i32, i32* %p monotonic, align 4
   %2 = add i32 %1, 1
@@ -447,15 +758,15 @@ define void @inc_32_seq_cst(i32* %p) {
 ; ----- DEC -----
 
 define void @dec_8(i8* %p) {
-; X64-LABEL: dec_8
+; X64-LABEL: dec_8:
 ; X64-NOT: lock
 ; X64: decb
 ; X64-NOT: movb
-; X32-LABEL: dec_8
+; X32-LABEL: dec_8:
 ; X32-NOT: lock
 ; X32: decb
 ; X32-NOT: movb
-; SLOW_INC-LABEL: dec_8
+; SLOW_INC-LABEL: dec_8:
 ; SLOW_INC-NOT: decb
 ; SLOW_INC-NOT: movb
   %1 = load atomic i8, i8* %p seq_cst, align 1
@@ -467,11 +778,11 @@ define void @dec_8(i8* %p) {
 define void @dec_16(i16* %p) {
 ;   Currently the transformation is not done on 16 bit accesses, as the backend
 ;   treats 16 bit arithmetic as expensive on X86/X86_64.
-; X64-LABEL: dec_16
+; X64-LABEL: dec_16:
 ; X64-NOT: decw
-; X32-LABEL: dec_16
+; X32-LABEL: dec_16:
 ; X32-NOT: decw
-; SLOW_INC-LABEL: dec_16
+; SLOW_INC-LABEL: dec_16:
 ; SLOW_INC-NOT: decw
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = sub i16 %1, 1
@@ -480,15 +791,15 @@ define void @dec_16(i16* %p) {
 }
 
 define void @dec_32(i32* %p) {
-; X64-LABEL: dec_32
+; X64-LABEL: dec_32:
 ; X64-NOT: lock
 ; X64: decl
 ; X64-NOT: movl
-; X32-LABEL: dec_32
+; X32-LABEL: dec_32:
 ; X32-NOT: lock
 ; X32: decl
 ; X32-NOT: movl
-; SLOW_INC-LABEL: dec_32
+; SLOW_INC-LABEL: dec_32:
 ; SLOW_INC-NOT: decl
 ; SLOW_INC-NOT: movl
   %1 = load atomic i32, i32* %p acquire, align 4
@@ -498,13 +809,13 @@ define void @dec_32(i32* %p) {
 }
 
 define void @dec_64(i64* %p) {
-; X64-LABEL: dec_64
+; X64-LABEL: dec_64:
 ; X64-NOT: lock
 ; X64: decq
 ; X64-NOT: movq
 ;   We do not check X86-32 as it cannot do 'decq'.
-; X32-LABEL: dec_64
-; SLOW_INC-LABEL: dec_64
+; X32-LABEL: dec_64:
+; SLOW_INC-LABEL: dec_64:
 ; SLOW_INC-NOT: decq
 ; SLOW_INC-NOT: movq
   %1 = load atomic i64, i64* %p acquire, align 8
@@ -514,12 +825,49 @@ define void @dec_64(i64* %p) {
 }
 
 define void @dec_32_seq_cst(i32* %p) {
-; X64-LABEL: dec_32_seq_cst
+; X64-LABEL: dec_32_seq_cst:
 ; X64: xchgl
-; X32-LABEL: dec_32_seq_cst
+; X32-LABEL: dec_32_seq_cst:
 ; X32: xchgl
   %1 = load atomic i32, i32* %p monotonic, align 4
   %2 = sub i32 %1, 1
   store atomic i32 %2, i32* %p seq_cst, align 4
   ret void
 }
+
+; ----- FADD -----
+
+define void @fadd_32r(float* %loc, float %val) {
+; X64-LABEL: fadd_32r:
+; X64-NOT: lock
+; X64-NOT: mov
+; X64: addss (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
+; X64-NEXT: movss %[[XMM]], (%[[M]])
+; X32-LABEL: fadd_32r:
+; Don't check x86-32.
+; LLVM's SSE handling is conservative on x86-32 even without using atomics.
+  %floc = bitcast float* %loc to i32*
+  %1 = load atomic i32, i32* %floc seq_cst, align 4
+  %2 = bitcast i32 %1 to float
+  %add = fadd float %2, %val
+  %3 = bitcast float %add to i32
+  store atomic i32 %3, i32* %floc release, align 4
+  ret void
+}
+
+define void @fadd_64r(double* %loc, double %val) {
+; X64-LABEL: fadd_64r:
+; X64-NOT: lock
+; X64-NOT: mov
+; X64: addsd (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
+; X64-NEXT: movsd %[[XMM]], (%[[M]])
+; X32-LABEL: fadd_64r:
+; Don't check x86-32 (see comment above).
+  %floc = bitcast double* %loc to i64*
+  %1 = load atomic i64, i64* %floc seq_cst, align 8
+  %2 = bitcast i64 %1 to double
+  %add = fadd double %2, %val
+  %3 = bitcast double %add to i64
+  store atomic i64 %3, i64* %floc release, align 8
+  ret void
+}