[llvm] a7115d5 - [X86] X86CallFrameOptimization - generalize slow push code path

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 29 03:03:44 PDT 2020


Author: Simon Pilgrim
Date: 2020-03-29T11:01:59+01:00
New Revision: a7115d51be09ebc8953a269d26bda3d0c50dbab2

URL: https://github.com/llvm/llvm-project/commit/a7115d51be09ebc8953a269d26bda3d0c50dbab2
DIFF: https://github.com/llvm/llvm-project/commit/a7115d51be09ebc8953a269d26bda3d0c50dbab2.diff

LOG: [X86] X86CallFrameOptimization - generalize slow push code path

Replace the explicit isAtom() || isSLM() test with the slowTwoMemOps() check, which is both more general (it also covers targets such as Goldmont and KNL, as the updated tests show) and more precise about the underlying limitation, to avoid the PUSHrmm push-from-memory form wherever instructions with two memory operands are slow.

This code path is tricky to exercise in anything but quite complex code; the atomic-idempotent.ll tests turned out to be the most straightforward way to cover it.
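
For illustration, the two sequences at stake, as they appear in the or128 test below (the exact registers and offsets are simply what that test happens to produce):

    # PUSHrmm folded - push directly from memory (targets where this is fast):
    pushl 12(%ebp)

    # PUSHrmm avoided - separate load and push (slowTwoMemOps targets
    # such as SLM, Goldmont, KNL and Atom):
    movl 12(%ebp), %eax
    pushl %eax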

Differential Revision: https://reviews.llvm.org/D76239

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86CallFrameOptimization.cpp
    llvm/test/CodeGen/X86/atomic-idempotent.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index a5831bc8ef0b..1d42dd77016e 100644
--- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -549,7 +549,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
 
       // If PUSHrmm is not slow on this target, try to fold the source of the
       // push into the instruction.
-      bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
+      bool SlowPUSHrmm = STI->slowTwoMemOps();
 
       // Check that this is legal to fold. Right now, we're extremely
       // conservative about that.

diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll
index 50c51fc0555c..5d93e23b3015 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -1,6 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs                           | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs           -mattr=+sse2      | FileCheck %s --check-prefixes=CHECK,X86,X86-GENERIC,X86-SSE2
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=slm -mattr=-sse2      | FileCheck %s --check-prefixes=CHECK,X86,X86-GENERIC,X86-SLM
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=goldmont -mattr=-sse2 | FileCheck %s --check-prefixes=CHECK,X86,X86-GENERIC,X86-SLM
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=knl -mattr=-sse2      | FileCheck %s --check-prefixes=CHECK,X86,X86-GENERIC,X86-SLM
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=atom -mattr=-sse2     | FileCheck %s --check-prefixes=CHECK,X86,X86-ATOM
 
 ; On x86, an atomic rmw operation that does not modify the value in memory
 ; (such as atomic add 0) can be replaced by an mfence followed by a mov.
@@ -14,12 +18,30 @@ define i8 @add8(i8* %p) {
 ; X64-NEXT:    movb (%rdi), %al
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: add8:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mfence
-; X86-NEXT:    movb (%eax), %al
-; X86-NEXT:    retl
+; X86-SSE2-LABEL: add8:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movb (%eax), %al
+; X86-SSE2-NEXT:    retl
+;
+; X86-SLM-LABEL: add8:
+; X86-SLM:       # %bb.0:
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLM-NEXT:    xorl %eax, %eax
+; X86-SLM-NEXT:    lock xaddb %al, (%ecx)
+; X86-SLM-NEXT:    # kill: def $al killed $al killed $eax
+; X86-SLM-NEXT:    retl
+;
+; X86-ATOM-LABEL: add8:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-ATOM-NEXT:    xorl %eax, %eax
+; X86-ATOM-NEXT:    lock xaddb %al, (%ecx)
+; X86-ATOM-NEXT:    # kill: def $al killed $al killed $eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
   %1 = atomicrmw add i8* %p, i8 0 monotonic
   ret i8 %1
 }
@@ -31,12 +53,36 @@ define i16 @or16(i16* %p) {
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: or16:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mfence
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    retl
+; X86-SSE2-LABEL: or16:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movzwl (%eax), %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SLM-LABEL: or16:
+; X86-SLM:       # %bb.0:
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLM-NEXT:    movzwl (%ecx), %eax
+; X86-SLM-NEXT:    .p2align 4, 0x90
+; X86-SLM-NEXT:  .LBB1_1: # %atomicrmw.start
+; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-SLM-NEXT:    lock cmpxchgw %ax, (%ecx)
+; X86-SLM-NEXT:    jne .LBB1_1
+; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT:    retl
+;
+; X86-ATOM-LABEL: or16:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-ATOM-NEXT:    movzwl (%ecx), %eax
+; X86-ATOM-NEXT:    .p2align 4, 0x90
+; X86-ATOM-NEXT:  .LBB1_1: # %atomicrmw.start
+; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-ATOM-NEXT:    lock cmpxchgw %ax, (%ecx)
+; X86-ATOM-NEXT:    jne .LBB1_1
+; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT:    retl
   %1 = atomicrmw or i16* %p, i16 0 acquire
   ret i16 %1
 }
@@ -48,12 +94,36 @@ define i32 @xor32(i32* %p) {
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: xor32:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mfence
-; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    retl
+; X86-SSE2-LABEL: xor32:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl (%eax), %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SLM-LABEL: xor32:
+; X86-SLM:       # %bb.0:
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLM-NEXT:    movl (%ecx), %eax
+; X86-SLM-NEXT:    .p2align 4, 0x90
+; X86-SLM-NEXT:  .LBB2_1: # %atomicrmw.start
+; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-SLM-NEXT:    lock cmpxchgl %eax, (%ecx)
+; X86-SLM-NEXT:    jne .LBB2_1
+; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT:    retl
+;
+; X86-ATOM-LABEL: xor32:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-ATOM-NEXT:    movl (%ecx), %eax
+; X86-ATOM-NEXT:    .p2align 4, 0x90
+; X86-ATOM-NEXT:  .LBB2_1: # %atomicrmw.start
+; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-ATOM-NEXT:    lock cmpxchgl %eax, (%ecx)
+; X86-ATOM-NEXT:    jne .LBB2_1
+; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT:    retl
   %1 = atomicrmw xor i32* %p, i32 0 release
   ret i32 %1
 }
@@ -105,44 +175,124 @@ define i128 @or128(i128* %p) {
 ; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: or128:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    .cfi_offset %esi, -16
-; X86-NEXT:    .cfi_offset %edi, -12
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movl %esp, %eax
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __sync_fetch_and_or_16
-; X86-NEXT:    addl $20, %esp
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, 8(%esi)
-; X86-NEXT:    movl %edx, 12(%esi)
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %ecx, 4(%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
-; X86-NEXT:    retl $4
+; X86-SSE2-LABEL: or128:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE2-NEXT:    .cfi_offset %ebp, -8
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    .cfi_def_cfa_register %ebp
+; X86-SSE2-NEXT:    pushl %edi
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    andl $-8, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    .cfi_offset %esi, -16
+; X86-SSE2-NEXT:    .cfi_offset %edi, -12
+; X86-SSE2-NEXT:    movl 8(%ebp), %esi
+; X86-SSE2-NEXT:    movl %esp, %eax
+; X86-SSE2-NEXT:    pushl $0
+; X86-SSE2-NEXT:    pushl $0
+; X86-SSE2-NEXT:    pushl $0
+; X86-SSE2-NEXT:    pushl $0
+; X86-SSE2-NEXT:    pushl 12(%ebp)
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    calll __sync_fetch_and_or_16
+; X86-SSE2-NEXT:    addl $20, %esp
+; X86-SSE2-NEXT:    movl (%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-SSE2-NEXT:    movl %edi, 8(%esi)
+; X86-SSE2-NEXT:    movl %edx, 12(%esi)
+; X86-SSE2-NEXT:    movl %eax, (%esi)
+; X86-SSE2-NEXT:    movl %ecx, 4(%esi)
+; X86-SSE2-NEXT:    movl %esi, %eax
+; X86-SSE2-NEXT:    leal -8(%ebp), %esp
+; X86-SSE2-NEXT:    popl %esi
+; X86-SSE2-NEXT:    popl %edi
+; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    .cfi_def_cfa %esp, 4
+; X86-SSE2-NEXT:    retl $4
+;
+; X86-SLM-LABEL: or128:
+; X86-SLM:       # %bb.0:
+; X86-SLM-NEXT:    pushl %ebp
+; X86-SLM-NEXT:    .cfi_def_cfa_offset 8
+; X86-SLM-NEXT:    .cfi_offset %ebp, -8
+; X86-SLM-NEXT:    movl %esp, %ebp
+; X86-SLM-NEXT:    .cfi_def_cfa_register %ebp
+; X86-SLM-NEXT:    pushl %edi
+; X86-SLM-NEXT:    pushl %esi
+; X86-SLM-NEXT:    andl $-8, %esp
+; X86-SLM-NEXT:    subl $16, %esp
+; X86-SLM-NEXT:    .cfi_offset %esi, -16
+; X86-SLM-NEXT:    .cfi_offset %edi, -12
+; X86-SLM-NEXT:    movl 8(%ebp), %esi
+; X86-SLM-NEXT:    movl 12(%ebp), %eax
+; X86-SLM-NEXT:    movl %esp, %ecx
+; X86-SLM-NEXT:    pushl $0
+; X86-SLM-NEXT:    pushl $0
+; X86-SLM-NEXT:    pushl $0
+; X86-SLM-NEXT:    pushl $0
+; X86-SLM-NEXT:    pushl %eax
+; X86-SLM-NEXT:    pushl %ecx
+; X86-SLM-NEXT:    calll __sync_fetch_and_or_16
+; X86-SLM-NEXT:    addl $20, %esp
+; X86-SLM-NEXT:    movl (%esp), %eax
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-SLM-NEXT:    movl %edi, 8(%esi)
+; X86-SLM-NEXT:    movl %edx, 12(%esi)
+; X86-SLM-NEXT:    movl %eax, (%esi)
+; X86-SLM-NEXT:    movl %ecx, 4(%esi)
+; X86-SLM-NEXT:    movl %esi, %eax
+; X86-SLM-NEXT:    leal -8(%ebp), %esp
+; X86-SLM-NEXT:    popl %esi
+; X86-SLM-NEXT:    popl %edi
+; X86-SLM-NEXT:    popl %ebp
+; X86-SLM-NEXT:    .cfi_def_cfa %esp, 4
+; X86-SLM-NEXT:    retl $4
+;
+; X86-ATOM-LABEL: or128:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    pushl %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa_offset 8
+; X86-ATOM-NEXT:    .cfi_offset %ebp, -8
+; X86-ATOM-NEXT:    leal (%esp), %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
+; X86-ATOM-NEXT:    pushl %edi
+; X86-ATOM-NEXT:    pushl %esi
+; X86-ATOM-NEXT:    andl $-8, %esp
+; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT:    .cfi_offset %esi, -16
+; X86-ATOM-NEXT:    .cfi_offset %edi, -12
+; X86-ATOM-NEXT:    movl 8(%ebp), %esi
+; X86-ATOM-NEXT:    movl 12(%ebp), %eax
+; X86-ATOM-NEXT:    movl %esp, %ecx
+; X86-ATOM-NEXT:    pushl $0
+; X86-ATOM-NEXT:    pushl $0
+; X86-ATOM-NEXT:    pushl $0
+; X86-ATOM-NEXT:    pushl $0
+; X86-ATOM-NEXT:    pushl %eax
+; X86-ATOM-NEXT:    pushl %ecx
+; X86-ATOM-NEXT:    calll __sync_fetch_and_or_16
+; X86-ATOM-NEXT:    leal {{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT:    movl (%esp), %ecx
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-ATOM-NEXT:    movl %eax, 8(%esi)
+; X86-ATOM-NEXT:    movl %edi, 12(%esi)
+; X86-ATOM-NEXT:    movl %ecx, (%esi)
+; X86-ATOM-NEXT:    movl %esi, %eax
+; X86-ATOM-NEXT:    movl %edx, 4(%esi)
+; X86-ATOM-NEXT:    leal -8(%ebp), %esp
+; X86-ATOM-NEXT:    popl %esi
+; X86-ATOM-NEXT:    popl %edi
+; X86-ATOM-NEXT:    popl %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa %esp, 4
+; X86-ATOM-NEXT:    retl $4
   %1 = atomicrmw or i128* %p, i128 0 monotonic
   ret i128 %1
 }
@@ -155,49 +305,137 @@ define i32 @and32 (i32* %p) {
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: and32:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mfence
-; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    retl
+; X86-SSE2-LABEL: and32:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl (%eax), %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SLM-LABEL: and32:
+; X86-SLM:       # %bb.0:
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLM-NEXT:    movl (%ecx), %eax
+; X86-SLM-NEXT:    .p2align 4, 0x90
+; X86-SLM-NEXT:  .LBB5_1: # %atomicrmw.start
+; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-SLM-NEXT:    lock cmpxchgl %eax, (%ecx)
+; X86-SLM-NEXT:    jne .LBB5_1
+; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT:    retl
+;
+; X86-ATOM-LABEL: and32:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-ATOM-NEXT:    movl (%ecx), %eax
+; X86-ATOM-NEXT:    .p2align 4, 0x90
+; X86-ATOM-NEXT:  .LBB5_1: # %atomicrmw.start
+; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-ATOM-NEXT:    lock cmpxchgl %eax, (%ecx)
+; X86-ATOM-NEXT:    jne .LBB5_1
+; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT:    retl
   %1 = atomicrmw and i32* %p, i32 -1 acq_rel
   ret i32 %1
 }
 
 define void @or32_nouse_monotonic(i32* %p) {
-; CHECK-LABEL: or32_nouse_monotonic:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    #MEMBARRIER
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: or32_nouse_monotonic:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or32_nouse_monotonic:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or32_nouse_monotonic:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
   atomicrmw or i32* %p, i32 0 monotonic
   ret void
 }
 
 
 define void @or32_nouse_acquire(i32* %p) {
-; CHECK-LABEL: or32_nouse_acquire:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    #MEMBARRIER
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: or32_nouse_acquire:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or32_nouse_acquire:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or32_nouse_acquire:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
   atomicrmw or i32* %p, i32 0 acquire
   ret void
 }
 
 define void @or32_nouse_release(i32* %p) {
-; CHECK-LABEL: or32_nouse_release:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    #MEMBARRIER
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: or32_nouse_release:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or32_nouse_release:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or32_nouse_release:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
   atomicrmw or i32* %p, i32 0 release
   ret void
 }
 
 define void @or32_nouse_acq_rel(i32* %p) {
-; CHECK-LABEL: or32_nouse_acq_rel:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    #MEMBARRIER
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: or32_nouse_acq_rel:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or32_nouse_acq_rel:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or32_nouse_acq_rel:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
   atomicrmw or i32* %p, i32 0 acq_rel
   ret void
 }
@@ -208,10 +446,21 @@ define void @or32_nouse_seq_cst(i32* %p) {
 ; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: or32_nouse_seq_cst:
-; X86:       # %bb.0:
-; X86-NEXT:    lock orl $0, (%esp)
-; X86-NEXT:    retl
+; X86-GENERIC-LABEL: or32_nouse_seq_cst:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    lock orl $0, (%esp)
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or32_nouse_seq_cst:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    lock orl $0, (%esp)
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
   atomicrmw or i32* %p, i32 0 seq_cst
   ret void
 }
@@ -264,28 +513,76 @@ define void @or128_nouse_seq_cst(i128* %p) {
 ; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: or128_nouse_seq_cst:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl %esp, %eax
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl 8(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __sync_fetch_and_or_16
-; X86-NEXT:    addl $20, %esp
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
-; X86-NEXT:    retl
+; X86-SSE2-LABEL: or128_nouse_seq_cst:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE2-NEXT:    .cfi_offset %ebp, -8
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    .cfi_def_cfa_register %ebp
+; X86-SSE2-NEXT:    andl $-8, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl %esp, %eax
+; X86-SSE2-NEXT:    pushl $0
+; X86-SSE2-NEXT:    pushl $0
+; X86-SSE2-NEXT:    pushl $0
+; X86-SSE2-NEXT:    pushl $0
+; X86-SSE2-NEXT:    pushl 8(%ebp)
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    calll __sync_fetch_and_or_16
+; X86-SSE2-NEXT:    addl $20, %esp
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    .cfi_def_cfa %esp, 4
+; X86-SSE2-NEXT:    retl
+;
+; X86-SLM-LABEL: or128_nouse_seq_cst:
+; X86-SLM:       # %bb.0:
+; X86-SLM-NEXT:    pushl %ebp
+; X86-SLM-NEXT:    .cfi_def_cfa_offset 8
+; X86-SLM-NEXT:    .cfi_offset %ebp, -8
+; X86-SLM-NEXT:    movl %esp, %ebp
+; X86-SLM-NEXT:    .cfi_def_cfa_register %ebp
+; X86-SLM-NEXT:    andl $-8, %esp
+; X86-SLM-NEXT:    subl $16, %esp
+; X86-SLM-NEXT:    movl 8(%ebp), %eax
+; X86-SLM-NEXT:    movl %esp, %ecx
+; X86-SLM-NEXT:    pushl $0
+; X86-SLM-NEXT:    pushl $0
+; X86-SLM-NEXT:    pushl $0
+; X86-SLM-NEXT:    pushl $0
+; X86-SLM-NEXT:    pushl %eax
+; X86-SLM-NEXT:    pushl %ecx
+; X86-SLM-NEXT:    calll __sync_fetch_and_or_16
+; X86-SLM-NEXT:    addl $20, %esp
+; X86-SLM-NEXT:    movl %ebp, %esp
+; X86-SLM-NEXT:    popl %ebp
+; X86-SLM-NEXT:    .cfi_def_cfa %esp, 4
+; X86-SLM-NEXT:    retl
+;
+; X86-ATOM-LABEL: or128_nouse_seq_cst:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    pushl %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa_offset 8
+; X86-ATOM-NEXT:    .cfi_offset %ebp, -8
+; X86-ATOM-NEXT:    leal (%esp), %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
+; X86-ATOM-NEXT:    andl $-8, %esp
+; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT:    movl 8(%ebp), %eax
+; X86-ATOM-NEXT:    movl %esp, %ecx
+; X86-ATOM-NEXT:    pushl $0
+; X86-ATOM-NEXT:    pushl $0
+; X86-ATOM-NEXT:    pushl $0
+; X86-ATOM-NEXT:    pushl $0
+; X86-ATOM-NEXT:    pushl %eax
+; X86-ATOM-NEXT:    pushl %ecx
+; X86-ATOM-NEXT:    calll __sync_fetch_and_or_16
+; X86-ATOM-NEXT:    leal {{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT:    movl %ebp, %esp
+; X86-ATOM-NEXT:    popl %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa %esp, 4
+; X86-ATOM-NEXT:    retl
   atomicrmw or i128* %p, i128 0 seq_cst
   ret void
 }
@@ -297,10 +594,21 @@ define void @or16_nouse_seq_cst(i16* %p) {
 ; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: or16_nouse_seq_cst:
-; X86:       # %bb.0:
-; X86-NEXT:    lock orl $0, (%esp)
-; X86-NEXT:    retl
+; X86-GENERIC-LABEL: or16_nouse_seq_cst:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    lock orl $0, (%esp)
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or16_nouse_seq_cst:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    lock orl $0, (%esp)
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
   atomicrmw or i16* %p, i16 0 seq_cst
   ret void
 }
@@ -311,10 +619,21 @@ define void @or8_nouse_seq_cst(i8* %p) {
 ; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: or8_nouse_seq_cst:
-; X86:       # %bb.0:
-; X86-NEXT:    lock orl $0, (%esp)
-; X86-NEXT:    retl
+; X86-GENERIC-LABEL: or8_nouse_seq_cst:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    lock orl $0, (%esp)
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or8_nouse_seq_cst:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    lock orl $0, (%esp)
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
   atomicrmw or i8* %p, i8 0 seq_cst
   ret void
 }