[llvm] [X86] Use fence(seq_cst) in IdempotentRMWIntoFencedLoad (PR #126521)

Valentin Churavy via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 6 07:28:04 PST 2025


https://github.com/vchuravy updated https://github.com/llvm/llvm-project/pull/126521

>From 93cb3f0d2c4dad09a39e3fcbdf999cf9e7e756b7 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy at gmail.com>
Date: Mon, 10 Feb 2025 15:19:20 +0100
Subject: [PATCH 1/4] [X86] Use fence(seq_cst) in IdempotentRMWIntoFencedLoad

This extends the optimization to scenarios where the subtarget
has `!hasMFence` or the operation uses SyncScope SingleThread, by
avoiding the direct use of `llvm.x86.sse2.mfence`.

Originally part of #106555
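
As an illustrative sketch (the value names and the exact load ordering
below are for exposition only; the load ordering is derived from the
original rmw), the transform turns an idempotent RMW such as

  %old = atomicrmw or ptr %p, i32 0 syncscope("singlethread") acquire

into a fence followed by an atomic load of the same location:

  fence syncscope("singlethread") seq_cst
  %old = load atomic i32, ptr %p acquire, align 4

so the generic fence lowering can choose the right sequence (`mfence`,
`lock or`, or a compiler-only `#MEMBARRIER` for the single-thread scope)
instead of hard-coding the SSE2 intrinsic.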
---
 llvm/lib/Target/X86/X86ISelLowering.cpp    | 17 +----
 llvm/test/CodeGen/X86/atomic-idempotent.ll | 86 ++++++++--------------
 2 files changed, 33 insertions(+), 70 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fb799580f835a..1012dfb8be0fe 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31896,21 +31896,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   // otherwise, we might be able to be more aggressive on relaxed idempotent
   // rmw. In practice, they do not look useful, so we don't try to be
   // especially clever.
-  if (SSID == SyncScope::SingleThread)
-    // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
-    // the IR level, so we must wrap it in an intrinsic.
-    return nullptr;
-
-  if (!Subtarget.hasMFence())
-    // FIXME: it might make sense to use a locked operation here but on a
-    // different cache-line to prevent cache-line bouncing. In practice it
-    // is probably a small win, and x86 processors without mfence are rare
-    // enough that we do not bother.
-    return nullptr;
 
-  Function *MFence =
-      llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse2_mfence);
-  Builder.CreateCall(MFence, {});
+  // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
+  // lowering for SSID == SyncScope::SingleThread and !hasMFence.
+  Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
 
   // Finally we can emit the atomic load.
   LoadInst *Loaded = Builder.CreateAlignedLoad(
diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll
index 55b4d1af094f6..10e8cfc0ad497 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -27,18 +27,16 @@ define i8 @add8(ptr %p) {
 ;
 ; X86-SLM-LABEL: add8:
 ; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT:    xorl %eax, %eax
-; X86-SLM-NEXT:    lock xaddb %al, (%ecx)
-; X86-SLM-NEXT:    # kill: def $al killed $al killed $eax
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT:    lock orl $0, (%esp)
+; X86-SLM-NEXT:    movzbl (%eax), %eax
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: add8:
 ; X86-ATOM:       # %bb.0:
-; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT:    xorl %eax, %eax
-; X86-ATOM-NEXT:    lock xaddb %al, (%ecx)
-; X86-ATOM-NEXT:    # kill: def $al killed $al killed $eax
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    lock orl $0, (%esp)
+; X86-ATOM-NEXT:    movzbl (%eax), %eax
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
@@ -62,26 +60,18 @@ define i16 @or16(ptr %p) {
 ;
 ; X86-SLM-LABEL: or16:
 ; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT:    movzwl (%ecx), %eax
-; X86-SLM-NEXT:    .p2align 4
-; X86-SLM-NEXT:  .LBB1_1: # %atomicrmw.start
-; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT:    lock cmpxchgw %ax, (%ecx)
-; X86-SLM-NEXT:    jne .LBB1_1
-; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT:    lock orl $0, (%esp)
+; X86-SLM-NEXT:    movzwl (%eax), %eax
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or16:
 ; X86-ATOM:       # %bb.0:
-; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT:    movzwl (%ecx), %eax
-; X86-ATOM-NEXT:    .p2align 4
-; X86-ATOM-NEXT:  .LBB1_1: # %atomicrmw.start
-; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT:    lock cmpxchgw %ax, (%ecx)
-; X86-ATOM-NEXT:    jne .LBB1_1
-; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    lock orl $0, (%esp)
+; X86-ATOM-NEXT:    movzwl (%eax), %eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   %1 = atomicrmw or ptr %p, i16 0 acquire
   ret i16 %1
@@ -103,26 +93,18 @@ define i32 @xor32(ptr %p) {
 ;
 ; X86-SLM-LABEL: xor32:
 ; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT:    movl (%ecx), %eax
-; X86-SLM-NEXT:    .p2align 4
-; X86-SLM-NEXT:  .LBB2_1: # %atomicrmw.start
-; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT:    lock cmpxchgl %eax, (%ecx)
-; X86-SLM-NEXT:    jne .LBB2_1
-; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT:    lock orl $0, (%esp)
+; X86-SLM-NEXT:    movl (%eax), %eax
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: xor32:
 ; X86-ATOM:       # %bb.0:
-; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT:    movl (%ecx), %eax
-; X86-ATOM-NEXT:    .p2align 4
-; X86-ATOM-NEXT:  .LBB2_1: # %atomicrmw.start
-; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT:    lock cmpxchgl %eax, (%ecx)
-; X86-ATOM-NEXT:    jne .LBB2_1
-; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    lock orl $0, (%esp)
+; X86-ATOM-NEXT:    movl (%eax), %eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   %1 = atomicrmw xor ptr %p, i32 0 release
   ret i32 %1
@@ -318,26 +300,18 @@ define i32 @and32 (ptr %p) {
 ;
 ; X86-SLM-LABEL: and32:
 ; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT:    movl (%ecx), %eax
-; X86-SLM-NEXT:    .p2align 4
-; X86-SLM-NEXT:  .LBB5_1: # %atomicrmw.start
-; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT:    lock cmpxchgl %eax, (%ecx)
-; X86-SLM-NEXT:    jne .LBB5_1
-; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT:    lock orl $0, (%esp)
+; X86-SLM-NEXT:    movl (%eax), %eax
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: and32:
 ; X86-ATOM:       # %bb.0:
-; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT:    movl (%ecx), %eax
-; X86-ATOM-NEXT:    .p2align 4
-; X86-ATOM-NEXT:  .LBB5_1: # %atomicrmw.start
-; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT:    lock cmpxchgl %eax, (%ecx)
-; X86-ATOM-NEXT:    jne .LBB5_1
-; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    lock orl $0, (%esp)
+; X86-ATOM-NEXT:    movl (%eax), %eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   %1 = atomicrmw and ptr %p, i32 -1 acq_rel
   ret i32 %1

>From 52ff0b23cd1a7522a1a34d5d50796a6d51728528 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy at gmail.com>
Date: Tue, 11 Feb 2025 11:01:51 +0100
Subject: [PATCH 2/4] add test for syncscope

---
 .../X86/atomic-idempotent-syncscope.ll        | 654 ++++++++++++++++++
 1 file changed, 654 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll

diff --git a/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll b/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
new file mode 100644
index 0000000000000..8e929713d1549
--- /dev/null
+++ b/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
@@ -0,0 +1,654 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs                           | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs           -mattr=+sse2      | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SSE2
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=slm -mattr=-sse2      | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=goldmont -mattr=-sse2 | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=knl -mattr=-sse2      | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=atom -mattr=-sse2     | FileCheck %s --check-prefixes=X86,X86-ATOM
+
+; On x86, an atomic rmw operation that does not modify the value in memory
+; (such as atomic add 0) can be replaced by an mfence followed by a mov.
+; This is explained (with the motivation for such an optimization) in
+; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+
+define i8 @add8(ptr %p) {
+; X64-LABEL: add8:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock xaddb %al, (%rdi)
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: add8:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-GENERIC-NEXT:    xorl %eax, %eax
+; X86-GENERIC-NEXT:    lock xaddb %al, (%ecx)
+; X86-GENERIC-NEXT:    # kill: def $al killed $al killed $eax
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: add8:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-ATOM-NEXT:    xorl %eax, %eax
+; X86-ATOM-NEXT:    lock xaddb %al, (%ecx)
+; X86-ATOM-NEXT:    # kill: def $al killed $al killed $eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+  %1 = atomicrmw add ptr %p, i8 0 syncscope("singlethread") monotonic
+  ret i8 %1
+}
+
+define i16 @or16(ptr %p) {
+; X64-LABEL: or16:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    .p2align 4, 0x90
+; X64-NEXT:  .LBB1_1: # %atomicrmw.start
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    lock cmpxchgw %ax, (%rdi)
+; X64-NEXT:    jne .LBB1_1
+; X64-NEXT:  # %bb.2: # %atomicrmw.end
+; X64-NEXT:    retq
+;
+; X86-LABEL: or16:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
+; X86-NEXT:    .p2align 4, 0x90
+; X86-NEXT:  .LBB1_1: # %atomicrmw.start
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    lock cmpxchgw %ax, (%ecx)
+; X86-NEXT:    jne .LBB1_1
+; X86-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-NEXT:    retl
+  %1 = atomicrmw or ptr %p, i16 0 syncscope("singlethread") acquire
+  ret i16 %1
+}
+
+define i32 @xor32(ptr %p) {
+; X64-LABEL: xor32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    .p2align 4, 0x90
+; X64-NEXT:  .LBB2_1: # %atomicrmw.start
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    lock cmpxchgl %eax, (%rdi)
+; X64-NEXT:    jne .LBB2_1
+; X64-NEXT:  # %bb.2: # %atomicrmw.end
+; X64-NEXT:    retq
+;
+; X86-LABEL: xor32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    .p2align 4, 0x90
+; X86-NEXT:  .LBB2_1: # %atomicrmw.start
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    lock cmpxchgl %eax, (%ecx)
+; X86-NEXT:    jne .LBB2_1
+; X86-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-NEXT:    retl
+  %1 = atomicrmw xor ptr %p, i32 0 syncscope("singlethread") release
+  ret i32 %1
+}
+
+define i64 @sub64(ptr %p) {
+; X64-LABEL: sub64:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    lock xaddq %rax, (%rdi)
+; X64-NEXT:    retq
+;
+; X86-LABEL: sub64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
+; X86-NEXT:    movl 4(%esi), %edx
+; X86-NEXT:    .p2align 4, 0x90
+; X86-NEXT:  .LBB3_1: # %atomicrmw.start
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    lock cmpxchg8b (%esi)
+; X86-NEXT:    jne .LBB3_1
+; X86-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+  %1 = atomicrmw sub ptr %p, i64 0 syncscope("singlethread") seq_cst
+  ret i64 %1
+}
+
+define i128 @or128(ptr %p) {
+; X64-LABEL: or128:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    xorl %esi, %esi
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    callq __atomic_fetch_or_16 at PLT
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or128:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    pushl %ebp
+; X86-GENERIC-NEXT:    .cfi_def_cfa_offset 8
+; X86-GENERIC-NEXT:    .cfi_offset %ebp, -8
+; X86-GENERIC-NEXT:    movl %esp, %ebp
+; X86-GENERIC-NEXT:    .cfi_def_cfa_register %ebp
+; X86-GENERIC-NEXT:    pushl %ebx
+; X86-GENERIC-NEXT:    pushl %edi
+; X86-GENERIC-NEXT:    pushl %esi
+; X86-GENERIC-NEXT:    andl $-16, %esp
+; X86-GENERIC-NEXT:    subl $48, %esp
+; X86-GENERIC-NEXT:    .cfi_offset %esi, -20
+; X86-GENERIC-NEXT:    .cfi_offset %edi, -16
+; X86-GENERIC-NEXT:    .cfi_offset %ebx, -12
+; X86-GENERIC-NEXT:    movl 12(%ebp), %edi
+; X86-GENERIC-NEXT:    movl 12(%edi), %ecx
+; X86-GENERIC-NEXT:    movl 8(%edi), %edx
+; X86-GENERIC-NEXT:    movl (%edi), %ebx
+; X86-GENERIC-NEXT:    movl 4(%edi), %esi
+; X86-GENERIC-NEXT:    .p2align 4, 0x90
+; X86-GENERIC-NEXT:  .LBB4_1: # %atomicrmw.start
+; X86-GENERIC-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-GENERIC-NEXT:    movl %ebx, (%esp)
+; X86-GENERIC-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    pushl $0
+; X86-GENERIC-NEXT:    pushl $0
+; X86-GENERIC-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    pushl %eax
+; X86-GENERIC-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    pushl %eax
+; X86-GENERIC-NEXT:    pushl %edi
+; X86-GENERIC-NEXT:    pushl $16
+; X86-GENERIC-NEXT:    calll __atomic_compare_exchange at PLT
+; X86-GENERIC-NEXT:    addl $24, %esp
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-GENERIC-NEXT:    movl (%esp), %ebx
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-GENERIC-NEXT:    testb %al, %al
+; X86-GENERIC-NEXT:    je .LBB4_1
+; X86-GENERIC-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-GENERIC-NEXT:    movl 8(%ebp), %eax
+; X86-GENERIC-NEXT:    movl %ebx, (%eax)
+; X86-GENERIC-NEXT:    movl %esi, 4(%eax)
+; X86-GENERIC-NEXT:    movl %edx, 8(%eax)
+; X86-GENERIC-NEXT:    movl %ecx, 12(%eax)
+; X86-GENERIC-NEXT:    leal -12(%ebp), %esp
+; X86-GENERIC-NEXT:    popl %esi
+; X86-GENERIC-NEXT:    popl %edi
+; X86-GENERIC-NEXT:    popl %ebx
+; X86-GENERIC-NEXT:    popl %ebp
+; X86-GENERIC-NEXT:    .cfi_def_cfa %esp, 4
+; X86-GENERIC-NEXT:    retl $4
+;
+; X86-ATOM-LABEL: or128:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    pushl %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa_offset 8
+; X86-ATOM-NEXT:    .cfi_offset %ebp, -8
+; X86-ATOM-NEXT:    movl %esp, %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
+; X86-ATOM-NEXT:    pushl %ebx
+; X86-ATOM-NEXT:    pushl %edi
+; X86-ATOM-NEXT:    pushl %esi
+; X86-ATOM-NEXT:    andl $-16, %esp
+; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT:    .cfi_offset %esi, -20
+; X86-ATOM-NEXT:    .cfi_offset %edi, -16
+; X86-ATOM-NEXT:    .cfi_offset %ebx, -12
+; X86-ATOM-NEXT:    movl 12(%ebp), %edi
+; X86-ATOM-NEXT:    movl 12(%edi), %ecx
+; X86-ATOM-NEXT:    movl 8(%edi), %edx
+; X86-ATOM-NEXT:    movl (%edi), %esi
+; X86-ATOM-NEXT:    movl 4(%edi), %ebx
+; X86-ATOM-NEXT:    .p2align 4, 0x90
+; X86-ATOM-NEXT:  .LBB4_1: # %atomicrmw.start
+; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-ATOM-NEXT:    movl %esi, (%esp)
+; X86-ATOM-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    pushl $0
+; X86-ATOM-NEXT:    pushl $0
+; X86-ATOM-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    pushl %eax
+; X86-ATOM-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    pushl %eax
+; X86-ATOM-NEXT:    pushl %edi
+; X86-ATOM-NEXT:    pushl $16
+; X86-ATOM-NEXT:    calll __atomic_compare_exchange at PLT
+; X86-ATOM-NEXT:    leal {{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-ATOM-NEXT:    testb %al, %al
+; X86-ATOM-NEXT:    movl (%esp), %esi
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-ATOM-NEXT:    je .LBB4_1
+; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT:    movl 8(%ebp), %eax
+; X86-ATOM-NEXT:    movl %esi, (%eax)
+; X86-ATOM-NEXT:    movl %ebx, 4(%eax)
+; X86-ATOM-NEXT:    movl %edx, 8(%eax)
+; X86-ATOM-NEXT:    movl %ecx, 12(%eax)
+; X86-ATOM-NEXT:    leal -12(%ebp), %esp
+; X86-ATOM-NEXT:    popl %esi
+; X86-ATOM-NEXT:    popl %edi
+; X86-ATOM-NEXT:    popl %ebx
+; X86-ATOM-NEXT:    popl %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa %esp, 4
+; X86-ATOM-NEXT:    retl $4
+  %1 = atomicrmw or ptr %p, i128 0 syncscope("singlethread") monotonic
+  ret i128 %1
+}
+
+; For 'and', the idempotent value is (-1)
+define i32 @and32 (ptr %p) {
+; X64-LABEL: and32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    .p2align 4, 0x90
+; X64-NEXT:  .LBB5_1: # %atomicrmw.start
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    lock cmpxchgl %eax, (%rdi)
+; X64-NEXT:    jne .LBB5_1
+; X64-NEXT:  # %bb.2: # %atomicrmw.end
+; X64-NEXT:    retq
+;
+; X86-LABEL: and32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    .p2align 4, 0x90
+; X86-NEXT:  .LBB5_1: # %atomicrmw.start
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    lock cmpxchgl %eax, (%ecx)
+; X86-NEXT:    jne .LBB5_1
+; X86-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-NEXT:    retl
+  %1 = atomicrmw and ptr %p, i32 -1 syncscope("singlethread") acq_rel
+  ret i32 %1
+}
+
+define void @or32_nouse_monotonic(ptr %p) {
+; X64-LABEL: or32_nouse_monotonic:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or32_nouse_monotonic:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or32_nouse_monotonic:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+  atomicrmw or ptr %p, i32 0 syncscope("singlethread") monotonic
+  ret void
+}
+
+
+define void @or32_nouse_acquire(ptr %p) {
+; X64-LABEL: or32_nouse_acquire:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or32_nouse_acquire:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or32_nouse_acquire:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+  atomicrmw or ptr %p, i32 0 acquire
+  ret void
+}
+
+define void @or32_nouse_release(ptr %p) {
+; X64-LABEL: or32_nouse_release:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or32_nouse_release:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or32_nouse_release:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+  atomicrmw or ptr %p, i32 0 syncscope("singlethread") release
+  ret void
+}
+
+define void @or32_nouse_acq_rel(ptr %p) {
+; X64-LABEL: or32_nouse_acq_rel:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or32_nouse_acq_rel:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or32_nouse_acq_rel:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+  atomicrmw or ptr %p, i32 0 syncscope("singlethread") acq_rel
+  ret void
+}
+
+define void @or32_nouse_seq_cst(ptr %p) {
+; X64-LABEL: or32_nouse_seq_cst:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or32_nouse_seq_cst:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or32_nouse_seq_cst:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+  atomicrmw or ptr %p, i32 0 syncscope("singlethread") seq_cst
+  ret void
+}
+
+; TODO: The value isn't used on 32 bit, so the cmpxchg8b is unneeded
+define void @or64_nouse_seq_cst(ptr %p) {
+; X64-LABEL: or64_nouse_seq_cst:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-LABEL: or64_nouse_seq_cst:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
+; X86-NEXT:    movl 4(%esi), %edx
+; X86-NEXT:    .p2align 4, 0x90
+; X86-NEXT:  .LBB11_1: # %atomicrmw.start
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    lock cmpxchg8b (%esi)
+; X86-NEXT:    jne .LBB11_1
+; X86-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+  atomicrmw or ptr %p, i64 0 syncscope("singlethread") seq_cst
+  ret void
+}
+
+; TODO: Don't need to lower as sync_and_fetch call
+define void @or128_nouse_seq_cst(ptr %p) {
+; X64-LABEL: or128_nouse_seq_cst:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    xorl %esi, %esi
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    movl $5, %ecx
+; X64-NEXT:    callq __atomic_fetch_or_16 at PLT
+; X64-NEXT:    popq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or128_nouse_seq_cst:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    pushl %ebp
+; X86-GENERIC-NEXT:    .cfi_def_cfa_offset 8
+; X86-GENERIC-NEXT:    .cfi_offset %ebp, -8
+; X86-GENERIC-NEXT:    movl %esp, %ebp
+; X86-GENERIC-NEXT:    .cfi_def_cfa_register %ebp
+; X86-GENERIC-NEXT:    pushl %ebx
+; X86-GENERIC-NEXT:    pushl %edi
+; X86-GENERIC-NEXT:    pushl %esi
+; X86-GENERIC-NEXT:    andl $-16, %esp
+; X86-GENERIC-NEXT:    subl $48, %esp
+; X86-GENERIC-NEXT:    .cfi_offset %esi, -20
+; X86-GENERIC-NEXT:    .cfi_offset %edi, -16
+; X86-GENERIC-NEXT:    .cfi_offset %ebx, -12
+; X86-GENERIC-NEXT:    movl 8(%ebp), %esi
+; X86-GENERIC-NEXT:    movl 12(%esi), %ecx
+; X86-GENERIC-NEXT:    movl 8(%esi), %edi
+; X86-GENERIC-NEXT:    movl (%esi), %edx
+; X86-GENERIC-NEXT:    movl 4(%esi), %ebx
+; X86-GENERIC-NEXT:    .p2align 4, 0x90
+; X86-GENERIC-NEXT:  .LBB12_1: # %atomicrmw.start
+; X86-GENERIC-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-GENERIC-NEXT:    movl %edx, (%esp)
+; X86-GENERIC-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    pushl $5
+; X86-GENERIC-NEXT:    pushl $5
+; X86-GENERIC-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    pushl %eax
+; X86-GENERIC-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    pushl %eax
+; X86-GENERIC-NEXT:    pushl %esi
+; X86-GENERIC-NEXT:    pushl $16
+; X86-GENERIC-NEXT:    calll __atomic_compare_exchange at PLT
+; X86-GENERIC-NEXT:    addl $24, %esp
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-GENERIC-NEXT:    movl (%esp), %edx
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-GENERIC-NEXT:    testb %al, %al
+; X86-GENERIC-NEXT:    je .LBB12_1
+; X86-GENERIC-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-GENERIC-NEXT:    leal -12(%ebp), %esp
+; X86-GENERIC-NEXT:    popl %esi
+; X86-GENERIC-NEXT:    popl %edi
+; X86-GENERIC-NEXT:    popl %ebx
+; X86-GENERIC-NEXT:    popl %ebp
+; X86-GENERIC-NEXT:    .cfi_def_cfa %esp, 4
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or128_nouse_seq_cst:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    pushl %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa_offset 8
+; X86-ATOM-NEXT:    .cfi_offset %ebp, -8
+; X86-ATOM-NEXT:    movl %esp, %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
+; X86-ATOM-NEXT:    pushl %ebx
+; X86-ATOM-NEXT:    pushl %edi
+; X86-ATOM-NEXT:    pushl %esi
+; X86-ATOM-NEXT:    andl $-16, %esp
+; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT:    .cfi_offset %esi, -20
+; X86-ATOM-NEXT:    .cfi_offset %edi, -16
+; X86-ATOM-NEXT:    .cfi_offset %ebx, -12
+; X86-ATOM-NEXT:    movl 8(%ebp), %esi
+; X86-ATOM-NEXT:    movl %esp, %ebx
+; X86-ATOM-NEXT:    movl 12(%esi), %ecx
+; X86-ATOM-NEXT:    movl 8(%esi), %edx
+; X86-ATOM-NEXT:    movl (%esi), %eax
+; X86-ATOM-NEXT:    movl 4(%esi), %edi
+; X86-ATOM-NEXT:    .p2align 4, 0x90
+; X86-ATOM-NEXT:  .LBB12_1: # %atomicrmw.start
+; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-ATOM-NEXT:    movl %eax, (%esp)
+; X86-ATOM-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    pushl $5
+; X86-ATOM-NEXT:    pushl $5
+; X86-ATOM-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    pushl %eax
+; X86-ATOM-NEXT:    pushl %ebx
+; X86-ATOM-NEXT:    pushl %esi
+; X86-ATOM-NEXT:    pushl $16
+; X86-ATOM-NEXT:    calll __atomic_compare_exchange at PLT
+; X86-ATOM-NEXT:    leal {{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT:    testb %al, %al
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-ATOM-NEXT:    movl (%esp), %eax
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-ATOM-NEXT:    je .LBB12_1
+; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT:    leal -12(%ebp), %esp
+; X86-ATOM-NEXT:    popl %esi
+; X86-ATOM-NEXT:    popl %edi
+; X86-ATOM-NEXT:    popl %ebx
+; X86-ATOM-NEXT:    popl %ebp
+; X86-ATOM-NEXT:    .cfi_def_cfa %esp, 4
+; X86-ATOM-NEXT:    retl
+  atomicrmw or ptr %p, i128 0 syncscope("singlethread") seq_cst
+  ret void
+}
+
+
+define void @or16_nouse_seq_cst(ptr %p) {
+; X64-LABEL: or16_nouse_seq_cst:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or16_nouse_seq_cst:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or16_nouse_seq_cst:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+  atomicrmw or ptr %p, i16 0 syncscope("singlethread") seq_cst
+  ret void
+}
+
+define void @or8_nouse_seq_cst(ptr %p) {
+; X64-LABEL: or8_nouse_seq_cst:
+; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    retq
+;
+; X86-GENERIC-LABEL: or8_nouse_seq_cst:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or8_nouse_seq_cst:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+  atomicrmw or ptr %p, i8 0 syncscope("singlethread") seq_cst
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X86-SLM: {{.*}}
+; X86-SSE2: {{.*}}

>From 2357c81b0713add3927d6357f7df2baefbeafa8a Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy at gmail.com>
Date: Tue, 11 Feb 2025 11:03:24 +0100
Subject: [PATCH 3/4] update test with patch

---
 .../X86/atomic-idempotent-syncscope.ll        | 142 +++++++++---------
 1 file changed, 72 insertions(+), 70 deletions(-)

diff --git a/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll b/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
index 8e929713d1549..73f545b6a77c2 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
@@ -14,25 +14,24 @@
 define i8 @add8(ptr %p) {
 ; X64-LABEL: add8:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    lock xaddb %al, (%rdi)
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    movzbl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: add8:
 ; X86-GENERIC:       # %bb.0:
-; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-GENERIC-NEXT:    xorl %eax, %eax
-; X86-GENERIC-NEXT:    lock xaddb %al, (%ecx)
-; X86-GENERIC-NEXT:    # kill: def $al killed $al killed $eax
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    movzbl (%eax), %eax
 ; X86-GENERIC-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: add8:
 ; X86-ATOM:       # %bb.0:
-; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT:    xorl %eax, %eax
-; X86-ATOM-NEXT:    lock xaddb %al, (%ecx)
-; X86-ATOM-NEXT:    # kill: def $al killed $al killed $eax
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    movzbl (%eax), %eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
@@ -43,26 +42,27 @@ define i8 @add8(ptr %p) {
 define i16 @or16(ptr %p) {
 ; X64-LABEL: or16:
 ; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
 ; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB1_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    lock cmpxchgw %ax, (%rdi)
-; X64-NEXT:    jne .LBB1_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: or16:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB1_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    lock cmpxchgw %ax, (%ecx)
-; X86-NEXT:    jne .LBB1_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    retl
+; X86-GENERIC-LABEL: or16:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    movzwl (%eax), %eax
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: or16:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    movzwl (%eax), %eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
   %1 = atomicrmw or ptr %p, i16 0 syncscope("singlethread") acquire
   ret i16 %1
 }
@@ -70,26 +70,27 @@ define i16 @or16(ptr %p) {
 define i32 @xor32(ptr %p) {
 ; X64-LABEL: xor32:
 ; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
 ; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB2_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    lock cmpxchgl %eax, (%rdi)
-; X64-NEXT:    jne .LBB2_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: xor32:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB2_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    lock cmpxchgl %eax, (%ecx)
-; X86-NEXT:    jne .LBB2_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    retl
+; X86-GENERIC-LABEL: xor32:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    movl (%eax), %eax
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: xor32:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    movl (%eax), %eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
   %1 = atomicrmw xor ptr %p, i32 0 syncscope("singlethread") release
   ret i32 %1
 }
@@ -97,8 +98,8 @@ define i32 @xor32(ptr %p) {
 define i64 @sub64(ptr %p) {
 ; X64-LABEL: sub64:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    lock xaddq %rax, (%rdi)
+; X64-NEXT:    #MEMBARRIER
+; X64-NEXT:    movq (%rdi), %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: sub64:
@@ -112,7 +113,7 @@ define i64 @sub64(ptr %p) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    movl 4(%esi), %edx
-; X86-NEXT:    .p2align 4, 0x90
+; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB3_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %edx, %ecx
@@ -162,7 +163,7 @@ define i128 @or128(ptr %p) {
 ; X86-GENERIC-NEXT:    movl 8(%edi), %edx
 ; X86-GENERIC-NEXT:    movl (%edi), %ebx
 ; X86-GENERIC-NEXT:    movl 4(%edi), %esi
-; X86-GENERIC-NEXT:    .p2align 4, 0x90
+; X86-GENERIC-NEXT:    .p2align 4
 ; X86-GENERIC-NEXT:  .LBB4_1: # %atomicrmw.start
 ; X86-GENERIC-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-GENERIC-NEXT:    movl %ebx, (%esp)
@@ -223,7 +224,7 @@ define i128 @or128(ptr %p) {
 ; X86-ATOM-NEXT:    movl 8(%edi), %edx
 ; X86-ATOM-NEXT:    movl (%edi), %esi
 ; X86-ATOM-NEXT:    movl 4(%edi), %ebx
-; X86-ATOM-NEXT:    .p2align 4, 0x90
+; X86-ATOM-NEXT:    .p2align 4
 ; X86-ATOM-NEXT:  .LBB4_1: # %atomicrmw.start
 ; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-ATOM-NEXT:    movl %esi, (%esp)
@@ -271,26 +272,27 @@ define i128 @or128(ptr %p) {
 define i32 @and32 (ptr %p) {
 ; X64-LABEL: and32:
 ; X64:       # %bb.0:
+; X64-NEXT:    #MEMBARRIER
 ; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB5_1: # %atomicrmw.start
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    lock cmpxchgl %eax, (%rdi)
-; X64-NEXT:    jne .LBB5_1
-; X64-NEXT:  # %bb.2: # %atomicrmw.end
 ; X64-NEXT:    retq
 ;
-; X86-LABEL: and32:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB5_1: # %atomicrmw.start
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    lock cmpxchgl %eax, (%ecx)
-; X86-NEXT:    jne .LBB5_1
-; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    retl
+; X86-GENERIC-LABEL: and32:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    #MEMBARRIER
+; X86-GENERIC-NEXT:    movl (%eax), %eax
+; X86-GENERIC-NEXT:    retl
+;
+; X86-ATOM-LABEL: and32:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    #MEMBARRIER
+; X86-ATOM-NEXT:    movl (%eax), %eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
   %1 = atomicrmw and ptr %p, i32 -1 syncscope("singlethread") acq_rel
   ret i32 %1
 }
@@ -449,7 +451,7 @@ define void @or64_nouse_seq_cst(ptr %p) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    movl 4(%esi), %edx
-; X86-NEXT:    .p2align 4, 0x90
+; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB11_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %edx, %ecx
@@ -500,7 +502,7 @@ define void @or128_nouse_seq_cst(ptr %p) {
 ; X86-GENERIC-NEXT:    movl 8(%esi), %edi
 ; X86-GENERIC-NEXT:    movl (%esi), %edx
 ; X86-GENERIC-NEXT:    movl 4(%esi), %ebx
-; X86-GENERIC-NEXT:    .p2align 4, 0x90
+; X86-GENERIC-NEXT:    .p2align 4
 ; X86-GENERIC-NEXT:  .LBB12_1: # %atomicrmw.start
 ; X86-GENERIC-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-GENERIC-NEXT:    movl %edx, (%esp)
@@ -557,7 +559,7 @@ define void @or128_nouse_seq_cst(ptr %p) {
 ; X86-ATOM-NEXT:    movl 8(%esi), %edx
 ; X86-ATOM-NEXT:    movl (%esi), %eax
 ; X86-ATOM-NEXT:    movl 4(%esi), %edi
-; X86-ATOM-NEXT:    .p2align 4, 0x90
+; X86-ATOM-NEXT:    .p2align 4
 ; X86-ATOM-NEXT:  .LBB12_1: # %atomicrmw.start
 ; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-ATOM-NEXT:    movl %eax, (%esp)

>From af8f6d47e2f1da6978b3e5dc5a171fe8b68882b5 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy at gmail.com>
Date: Thu, 6 Mar 2025 16:27:48 +0100
Subject: [PATCH 4/4] mark tests as nounwind

---
 .../X86/atomic-idempotent-syncscope.ll        | 77 +++++--------------
 llvm/test/CodeGen/X86/atomic-idempotent.ll    | 76 ++++--------------
 2 files changed, 35 insertions(+), 118 deletions(-)

diff --git a/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll b/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
index 73f545b6a77c2..9e20fdb59f552 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
@@ -11,7 +11,7 @@
 ; This is explained (with the motivation for such an optimization) in
 ; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
 
-define i8 @add8(ptr %p) {
+define i8 @add8(ptr %p) #0 {
 ; X64-LABEL: add8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -39,7 +39,7 @@ define i8 @add8(ptr %p) {
   ret i8 %1
 }
 
-define i16 @or16(ptr %p) {
+define i16 @or16(ptr %p) #0 {
 ; X64-LABEL: or16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -67,7 +67,7 @@ define i16 @or16(ptr %p) {
   ret i16 %1
 }
 
-define i32 @xor32(ptr %p) {
+define i32 @xor32(ptr %p) #0 {
 ; X64-LABEL: xor32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -95,7 +95,7 @@ define i32 @xor32(ptr %p) {
   ret i32 %1
 }
 
-define i64 @sub64(ptr %p) {
+define i64 @sub64(ptr %p) #0 {
 ; X64-LABEL: sub64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -105,11 +105,7 @@ define i64 @sub64(ptr %p) {
 ; X86-LABEL: sub64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    .cfi_offset %ebx, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    movl 4(%esi), %edx
@@ -122,42 +118,32 @@ define i64 @sub64(ptr %p) {
 ; X86-NEXT:    jne .LBB3_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   %1 = atomicrmw sub ptr %p, i64 0 syncscope("singlethread") seq_cst
   ret i64 %1
 }
 
-define i128 @or128(ptr %p) {
+define i128 @or128(ptr %p) #0 {
 ; X64-LABEL: or128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    callq __atomic_fetch_or_16 at PLT
 ; X64-NEXT:    popq %rcx
-; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: or128:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    pushl %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa_offset 8
-; X86-GENERIC-NEXT:    .cfi_offset %ebp, -8
 ; X86-GENERIC-NEXT:    movl %esp, %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-GENERIC-NEXT:    pushl %ebx
 ; X86-GENERIC-NEXT:    pushl %edi
 ; X86-GENERIC-NEXT:    pushl %esi
 ; X86-GENERIC-NEXT:    andl $-16, %esp
 ; X86-GENERIC-NEXT:    subl $48, %esp
-; X86-GENERIC-NEXT:    .cfi_offset %esi, -20
-; X86-GENERIC-NEXT:    .cfi_offset %edi, -16
-; X86-GENERIC-NEXT:    .cfi_offset %ebx, -12
 ; X86-GENERIC-NEXT:    movl 12(%ebp), %edi
 ; X86-GENERIC-NEXT:    movl 12(%edi), %ecx
 ; X86-GENERIC-NEXT:    movl 8(%edi), %edx
@@ -201,24 +187,17 @@ define i128 @or128(ptr %p) {
 ; X86-GENERIC-NEXT:    popl %edi
 ; X86-GENERIC-NEXT:    popl %ebx
 ; X86-GENERIC-NEXT:    popl %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-GENERIC-NEXT:    retl $4
 ;
 ; X86-ATOM-LABEL: or128:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    pushl %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa_offset 8
-; X86-ATOM-NEXT:    .cfi_offset %ebp, -8
 ; X86-ATOM-NEXT:    movl %esp, %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-ATOM-NEXT:    pushl %ebx
 ; X86-ATOM-NEXT:    pushl %edi
 ; X86-ATOM-NEXT:    pushl %esi
 ; X86-ATOM-NEXT:    andl $-16, %esp
 ; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
-; X86-ATOM-NEXT:    .cfi_offset %esi, -20
-; X86-ATOM-NEXT:    .cfi_offset %edi, -16
-; X86-ATOM-NEXT:    .cfi_offset %ebx, -12
 ; X86-ATOM-NEXT:    movl 12(%ebp), %edi
 ; X86-ATOM-NEXT:    movl 12(%edi), %ecx
 ; X86-ATOM-NEXT:    movl 8(%edi), %edx
@@ -262,14 +241,13 @@ define i128 @or128(ptr %p) {
 ; X86-ATOM-NEXT:    popl %edi
 ; X86-ATOM-NEXT:    popl %ebx
 ; X86-ATOM-NEXT:    popl %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-ATOM-NEXT:    retl $4
   %1 = atomicrmw or ptr %p, i128 0 syncscope("singlethread") monotonic
   ret i128 %1
 }
 
 ; For 'and', the idempotent value is (-1)
-define i32 @and32 (ptr %p) {
+define i32 @and32 (ptr %p) #0 {
 ; X64-LABEL: and32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -297,7 +275,7 @@ define i32 @and32 (ptr %p) {
   ret i32 %1
 }
 
-define void @or32_nouse_monotonic(ptr %p) {
+define void @or32_nouse_monotonic(ptr %p) #0 {
 ; X64-LABEL: or32_nouse_monotonic:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -325,7 +303,7 @@ define void @or32_nouse_monotonic(ptr %p) {
 }
 
 
-define void @or32_nouse_acquire(ptr %p) {
+define void @or32_nouse_acquire(ptr %p) #0 {
 ; X64-LABEL: or32_nouse_acquire:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -352,7 +330,7 @@ define void @or32_nouse_acquire(ptr %p) {
   ret void
 }
 
-define void @or32_nouse_release(ptr %p) {
+define void @or32_nouse_release(ptr %p) #0 {
 ; X64-LABEL: or32_nouse_release:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -379,7 +357,7 @@ define void @or32_nouse_release(ptr %p) {
   ret void
 }
 
-define void @or32_nouse_acq_rel(ptr %p) {
+define void @or32_nouse_acq_rel(ptr %p) #0 {
 ; X64-LABEL: or32_nouse_acq_rel:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -406,7 +384,7 @@ define void @or32_nouse_acq_rel(ptr %p) {
   ret void
 }
 
-define void @or32_nouse_seq_cst(ptr %p) {
+define void @or32_nouse_seq_cst(ptr %p) #0 {
 ; X64-LABEL: or32_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -434,7 +412,7 @@ define void @or32_nouse_seq_cst(ptr %p) {
 }
 
 ; TODO: The value isn't used on 32 bit, so the cmpxchg8b is unneeded
-define void @or64_nouse_seq_cst(ptr %p) {
+define void @or64_nouse_seq_cst(ptr %p) #0 {
 ; X64-LABEL: or64_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -443,11 +421,7 @@ define void @or64_nouse_seq_cst(ptr %p) {
 ; X86-LABEL: or64_nouse_seq_cst:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    .cfi_offset %ebx, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    movl 4(%esi), %edx
@@ -460,43 +434,33 @@ define void @or64_nouse_seq_cst(ptr %p) {
 ; X86-NEXT:    jne .LBB11_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   atomicrmw or ptr %p, i64 0 syncscope("singlethread") seq_cst
   ret void
 }
 
 ; TODO: Don't need to lower as sync_and_fetch call
-define void @or128_nouse_seq_cst(ptr %p) {
+define void @or128_nouse_seq_cst(ptr %p) #0 {
 ; X64-LABEL: or128_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    movl $5, %ecx
 ; X64-NEXT:    callq __atomic_fetch_or_16 at PLT
 ; X64-NEXT:    popq %rax
-; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: or128_nouse_seq_cst:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    pushl %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa_offset 8
-; X86-GENERIC-NEXT:    .cfi_offset %ebp, -8
 ; X86-GENERIC-NEXT:    movl %esp, %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-GENERIC-NEXT:    pushl %ebx
 ; X86-GENERIC-NEXT:    pushl %edi
 ; X86-GENERIC-NEXT:    pushl %esi
 ; X86-GENERIC-NEXT:    andl $-16, %esp
 ; X86-GENERIC-NEXT:    subl $48, %esp
-; X86-GENERIC-NEXT:    .cfi_offset %esi, -20
-; X86-GENERIC-NEXT:    .cfi_offset %edi, -16
-; X86-GENERIC-NEXT:    .cfi_offset %ebx, -12
 ; X86-GENERIC-NEXT:    movl 8(%ebp), %esi
 ; X86-GENERIC-NEXT:    movl 12(%esi), %ecx
 ; X86-GENERIC-NEXT:    movl 8(%esi), %edi
@@ -535,24 +499,17 @@ define void @or128_nouse_seq_cst(ptr %p) {
 ; X86-GENERIC-NEXT:    popl %edi
 ; X86-GENERIC-NEXT:    popl %ebx
 ; X86-GENERIC-NEXT:    popl %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-GENERIC-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or128_nouse_seq_cst:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    pushl %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa_offset 8
-; X86-ATOM-NEXT:    .cfi_offset %ebp, -8
 ; X86-ATOM-NEXT:    movl %esp, %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-ATOM-NEXT:    pushl %ebx
 ; X86-ATOM-NEXT:    pushl %edi
 ; X86-ATOM-NEXT:    pushl %esi
 ; X86-ATOM-NEXT:    andl $-16, %esp
 ; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
-; X86-ATOM-NEXT:    .cfi_offset %esi, -20
-; X86-ATOM-NEXT:    .cfi_offset %edi, -16
-; X86-ATOM-NEXT:    .cfi_offset %ebx, -12
 ; X86-ATOM-NEXT:    movl 8(%ebp), %esi
 ; X86-ATOM-NEXT:    movl %esp, %ebx
 ; X86-ATOM-NEXT:    movl 12(%esi), %ecx
@@ -591,14 +548,13 @@ define void @or128_nouse_seq_cst(ptr %p) {
 ; X86-ATOM-NEXT:    popl %edi
 ; X86-ATOM-NEXT:    popl %ebx
 ; X86-ATOM-NEXT:    popl %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-ATOM-NEXT:    retl
   atomicrmw or ptr %p, i128 0 syncscope("singlethread") seq_cst
   ret void
 }
 
 
-define void @or16_nouse_seq_cst(ptr %p) {
+define void @or16_nouse_seq_cst(ptr %p) #0 {
 ; X64-LABEL: or16_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -625,7 +581,7 @@ define void @or16_nouse_seq_cst(ptr %p) {
   ret void
 }
 
-define void @or8_nouse_seq_cst(ptr %p) {
+define void @or8_nouse_seq_cst(ptr %p) #0 {
 ; X64-LABEL: or8_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -651,6 +607,9 @@ define void @or8_nouse_seq_cst(ptr %p) {
   atomicrmw or ptr %p, i8 0 syncscope("singlethread") seq_cst
   ret void
 }
+
+attributes #0 = { nounwind }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; X86-SLM: {{.*}}
 ; X86-SSE2: {{.*}}
diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll
index 10e8cfc0ad497..91355bd64cade 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -11,7 +11,7 @@
 ; This is explained (with the motivation for such an optimization) in
 ; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
 
-define i8 @add8(ptr %p) {
+define i8 @add8(ptr %p) #0 {
 ; X64-LABEL: add8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    mfence
@@ -44,7 +44,7 @@ define i8 @add8(ptr %p) {
   ret i8 %1
 }
 
-define i16 @or16(ptr %p) {
+define i16 @or16(ptr %p) #0 {
 ; X64-LABEL: or16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    mfence
@@ -77,7 +77,7 @@ define i16 @or16(ptr %p) {
   ret i16 %1
 }
 
-define i32 @xor32(ptr %p) {
+define i32 @xor32(ptr %p) #0 {
 ; X64-LABEL: xor32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    mfence
@@ -110,7 +110,7 @@ define i32 @xor32(ptr %p) {
   ret i32 %1
 }
 
-define i64 @sub64(ptr %p) {
+define i64 @sub64(ptr %p) #0 {
 ; X64-LABEL: sub64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    mfence
@@ -120,11 +120,7 @@ define i64 @sub64(ptr %p) {
 ; X86-LABEL: sub64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    .cfi_offset %ebx, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    movl 4(%esi), %edx
@@ -137,42 +133,32 @@ define i64 @sub64(ptr %p) {
 ; X86-NEXT:    jne .LBB3_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   %1 = atomicrmw sub ptr %p, i64 0 seq_cst
   ret i64 %1
 }
 
-define i128 @or128(ptr %p) {
+define i128 @or128(ptr %p) #0 {
 ; X64-LABEL: or128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    callq __atomic_fetch_or_16 at PLT
 ; X64-NEXT:    popq %rcx
-; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: or128:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    pushl %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa_offset 8
-; X86-GENERIC-NEXT:    .cfi_offset %ebp, -8
 ; X86-GENERIC-NEXT:    movl %esp, %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-GENERIC-NEXT:    pushl %ebx
 ; X86-GENERIC-NEXT:    pushl %edi
 ; X86-GENERIC-NEXT:    pushl %esi
 ; X86-GENERIC-NEXT:    andl $-16, %esp
 ; X86-GENERIC-NEXT:    subl $48, %esp
-; X86-GENERIC-NEXT:    .cfi_offset %esi, -20
-; X86-GENERIC-NEXT:    .cfi_offset %edi, -16
-; X86-GENERIC-NEXT:    .cfi_offset %ebx, -12
 ; X86-GENERIC-NEXT:    movl 12(%ebp), %edi
 ; X86-GENERIC-NEXT:    movl 12(%edi), %ecx
 ; X86-GENERIC-NEXT:    movl 8(%edi), %edx
@@ -216,24 +202,17 @@ define i128 @or128(ptr %p) {
 ; X86-GENERIC-NEXT:    popl %edi
 ; X86-GENERIC-NEXT:    popl %ebx
 ; X86-GENERIC-NEXT:    popl %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-GENERIC-NEXT:    retl $4
 ;
 ; X86-ATOM-LABEL: or128:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    pushl %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa_offset 8
-; X86-ATOM-NEXT:    .cfi_offset %ebp, -8
 ; X86-ATOM-NEXT:    movl %esp, %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-ATOM-NEXT:    pushl %ebx
 ; X86-ATOM-NEXT:    pushl %edi
 ; X86-ATOM-NEXT:    pushl %esi
 ; X86-ATOM-NEXT:    andl $-16, %esp
 ; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
-; X86-ATOM-NEXT:    .cfi_offset %esi, -20
-; X86-ATOM-NEXT:    .cfi_offset %edi, -16
-; X86-ATOM-NEXT:    .cfi_offset %ebx, -12
 ; X86-ATOM-NEXT:    movl 12(%ebp), %edi
 ; X86-ATOM-NEXT:    movl 12(%edi), %ecx
 ; X86-ATOM-NEXT:    movl 8(%edi), %edx
@@ -277,14 +256,13 @@ define i128 @or128(ptr %p) {
 ; X86-ATOM-NEXT:    popl %edi
 ; X86-ATOM-NEXT:    popl %ebx
 ; X86-ATOM-NEXT:    popl %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-ATOM-NEXT:    retl $4
   %1 = atomicrmw or ptr %p, i128 0 monotonic
   ret i128 %1
 }
 
 ; For 'and', the idempotent value is (-1)
-define i32 @and32 (ptr %p) {
+define i32 @and32 (ptr %p) #0 {
 ; X64-LABEL: and32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    mfence
@@ -317,7 +295,7 @@ define i32 @and32 (ptr %p) {
   ret i32 %1
 }
 
-define void @or32_nouse_monotonic(ptr %p) {
+define void @or32_nouse_monotonic(ptr %p) #0 {
 ; X64-LABEL: or32_nouse_monotonic:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -345,7 +323,7 @@ define void @or32_nouse_monotonic(ptr %p) {
 }
 
 
-define void @or32_nouse_acquire(ptr %p) {
+define void @or32_nouse_acquire(ptr %p) #0 {
 ; X64-LABEL: or32_nouse_acquire:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -372,7 +350,7 @@ define void @or32_nouse_acquire(ptr %p) {
   ret void
 }
 
-define void @or32_nouse_release(ptr %p) {
+define void @or32_nouse_release(ptr %p) #0 {
 ; X64-LABEL: or32_nouse_release:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -399,7 +377,7 @@ define void @or32_nouse_release(ptr %p) {
   ret void
 }
 
-define void @or32_nouse_acq_rel(ptr %p) {
+define void @or32_nouse_acq_rel(ptr %p) #0 {
 ; X64-LABEL: or32_nouse_acq_rel:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
@@ -426,7 +404,7 @@ define void @or32_nouse_acq_rel(ptr %p) {
   ret void
 }
 
-define void @or32_nouse_seq_cst(ptr %p) {
+define void @or32_nouse_seq_cst(ptr %p) #0 {
 ; X64-LABEL: or32_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
@@ -452,7 +430,7 @@ define void @or32_nouse_seq_cst(ptr %p) {
 }
 
 ; TODO: The value isn't used on 32 bit, so the cmpxchg8b is unneeded
-define void @or64_nouse_seq_cst(ptr %p) {
+define void @or64_nouse_seq_cst(ptr %p) #0 {
 ; X64-LABEL: or64_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
@@ -461,11 +439,7 @@ define void @or64_nouse_seq_cst(ptr %p) {
 ; X86-LABEL: or64_nouse_seq_cst:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    .cfi_offset %ebx, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    movl 4(%esi), %edx
@@ -478,43 +452,33 @@ define void @or64_nouse_seq_cst(ptr %p) {
 ; X86-NEXT:    jne .LBB11_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   atomicrmw or ptr %p, i64 0 seq_cst
   ret void
 }
 
 ; TODO: Don't need to lower as sync_and_fetch call
-define void @or128_nouse_seq_cst(ptr %p) {
+define void @or128_nouse_seq_cst(ptr %p) #0 {
 ; X64-LABEL: or128_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    movl $5, %ecx
 ; X64-NEXT:    callq __atomic_fetch_or_16 at PLT
 ; X64-NEXT:    popq %rax
-; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: or128_nouse_seq_cst:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    pushl %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa_offset 8
-; X86-GENERIC-NEXT:    .cfi_offset %ebp, -8
 ; X86-GENERIC-NEXT:    movl %esp, %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-GENERIC-NEXT:    pushl %ebx
 ; X86-GENERIC-NEXT:    pushl %edi
 ; X86-GENERIC-NEXT:    pushl %esi
 ; X86-GENERIC-NEXT:    andl $-16, %esp
 ; X86-GENERIC-NEXT:    subl $48, %esp
-; X86-GENERIC-NEXT:    .cfi_offset %esi, -20
-; X86-GENERIC-NEXT:    .cfi_offset %edi, -16
-; X86-GENERIC-NEXT:    .cfi_offset %ebx, -12
 ; X86-GENERIC-NEXT:    movl 8(%ebp), %esi
 ; X86-GENERIC-NEXT:    movl 12(%esi), %ecx
 ; X86-GENERIC-NEXT:    movl 8(%esi), %edi
@@ -553,24 +517,17 @@ define void @or128_nouse_seq_cst(ptr %p) {
 ; X86-GENERIC-NEXT:    popl %edi
 ; X86-GENERIC-NEXT:    popl %ebx
 ; X86-GENERIC-NEXT:    popl %ebp
-; X86-GENERIC-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-GENERIC-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or128_nouse_seq_cst:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    pushl %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa_offset 8
-; X86-ATOM-NEXT:    .cfi_offset %ebp, -8
 ; X86-ATOM-NEXT:    movl %esp, %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-ATOM-NEXT:    pushl %ebx
 ; X86-ATOM-NEXT:    pushl %edi
 ; X86-ATOM-NEXT:    pushl %esi
 ; X86-ATOM-NEXT:    andl $-16, %esp
 ; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
-; X86-ATOM-NEXT:    .cfi_offset %esi, -20
-; X86-ATOM-NEXT:    .cfi_offset %edi, -16
-; X86-ATOM-NEXT:    .cfi_offset %ebx, -12
 ; X86-ATOM-NEXT:    movl 8(%ebp), %esi
 ; X86-ATOM-NEXT:    movl %esp, %ebx
 ; X86-ATOM-NEXT:    movl 12(%esi), %ecx
@@ -609,14 +566,13 @@ define void @or128_nouse_seq_cst(ptr %p) {
 ; X86-ATOM-NEXT:    popl %edi
 ; X86-ATOM-NEXT:    popl %ebx
 ; X86-ATOM-NEXT:    popl %ebp
-; X86-ATOM-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-ATOM-NEXT:    retl
   atomicrmw or ptr %p, i128 0 seq_cst
   ret void
 }
 
 
-define void @or16_nouse_seq_cst(ptr %p) {
+define void @or16_nouse_seq_cst(ptr %p) #0 {
 ; X64-LABEL: or16_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
@@ -641,7 +597,7 @@ define void @or16_nouse_seq_cst(ptr %p) {
   ret void
 }
 
-define void @or8_nouse_seq_cst(ptr %p) {
+define void @or8_nouse_seq_cst(ptr %p) #0 {
 ; X64-LABEL: or8_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
@@ -665,3 +621,5 @@ define void @or8_nouse_seq_cst(ptr %p) {
   atomicrmw or ptr %p, i8 0 seq_cst
   ret void
 }
+
+attributes #0 = { nounwind }


