[llvm] a8031c9 - [AArch64] Add TypePromotion tests and regenerate atomic test check lines

David Green via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 29 06:35:16 PDT 2021


Author: David Green
Date: 2021-09-29T14:35:09+01:00
New Revision: a8031c92072137d7e9290511040a0d9c267bd461

URL: https://github.com/llvm/llvm-project/commit/a8031c92072137d7e9290511040a0d9c267bd461
DIFF: https://github.com/llvm/llvm-project/commit/a8031c92072137d7e9290511040a0d9c267bd461.diff

LOG: [AArch64] Add TypePromotion tests and regenerate atomic test check lines

This adds some extra tests for TypePromotion as per D110239, and
regenerates the check lines in atomic-ops.ll and cmpxchg-idioms.ll so
that they are easier to maintain as codegen changes (hopefully in a way
that does not reduce what is tested).

Added: 
    llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
    llvm/test/CodeGen/AArch64/typepromotion-phisret.ll
    llvm/test/CodeGen/AArch64/typepromotion-signed.ll

Modified: 
    llvm/test/CodeGen/AArch64/atomic-ops.ll
    llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
    llvm/test/CodeGen/AArch64/signed-truncation-check.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AArch64/atomic-ops.ll b/llvm/test/CodeGen/AArch64/atomic-ops.ll
index 07e27b09af11..e1fdf4c16f45 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops.ll
@@ -1,12 +1,6 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics < %s | FileCheck %s --check-prefix=OUTLINE_ATOMICS
-
-
-; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created
-; (i.e. reusing a register for status & data in store exclusive).
-; CHECK-REG-NOT: stlxrb w[[NEW:[0-9]+]], w[[NEW]], [x{{[0-9]+}}]
-; CHECK-REG-NOT: stlxrb w[[NEW:[0-9]+]], x[[NEW]], [x{{[0-9]+}}]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,INLINE_ATOMICS
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics < %s | FileCheck %s --check-prefixes=CHECK,OUTLINE_ATOMICS
 
 @var8 = dso_local global i8 0
 @var16 = dso_local global i16 0
@@ -14,7 +8,20 @@
 @var64 = dso_local global i64 0
 
 define dso_local i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_add_i8:
+; INLINE_ATOMICS-LABEL: test_atomic_load_add_i8:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var8
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; INLINE_ATOMICS-NEXT:  .LBB0_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxrb w8, [x9]
+; INLINE_ATOMICS-NEXT:    add w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stlxrb w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB0_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_add_i8:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -24,25 +31,24 @@ define dso_local i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw add i8* @var8, i8 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i8 %old
 }
 
 define dso_local i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_add_i16:
+; INLINE_ATOMICS-LABEL: test_atomic_load_add_i16:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var16
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; INLINE_ATOMICS-NEXT:  .LBB1_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxrh w8, [x9]
+; INLINE_ATOMICS-NEXT:    add w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stxrh w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB1_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_add_i16:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -52,25 +58,24 @@ define dso_local i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw add i16* @var16, i16 %offset acquire
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i16 %old
 }
 
 define dso_local i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_add_i32:
+; INLINE_ATOMICS-LABEL: test_atomic_load_add_i32:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var32
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var32
+; INLINE_ATOMICS-NEXT:  .LBB2_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxr w8, [x9]
+; INLINE_ATOMICS-NEXT:    add w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stlxr w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB2_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_add_i32:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -80,25 +85,24 @@ define dso_local i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw add i32* @var32, i32 %offset release
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i32 %old
 }
 
 define dso_local i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_add_i64:
+; INLINE_ATOMICS-LABEL: test_atomic_load_add_i64:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var64
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var64
+; INLINE_ATOMICS-NEXT:  .LBB3_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxr x8, [x9]
+; INLINE_ATOMICS-NEXT:    add x10, x8, x0
+; INLINE_ATOMICS-NEXT:    stxr w11, x10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB3_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov x0, x8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_add_i64:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -108,25 +112,24 @@ define dso_local i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw add i64* @var64, i64 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; x0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-; CHECK-NEXT: add [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov x0, x[[OLD]]
    ret i64 %old
 }
 
 define dso_local i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_sub_i8:
+; INLINE_ATOMICS-LABEL: test_atomic_load_sub_i8:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var8
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; INLINE_ATOMICS-NEXT:  .LBB4_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxrb w8, [x9]
+; INLINE_ATOMICS-NEXT:    sub w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stxrb w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB4_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_sub_i8:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -137,25 +140,24 @@ define dso_local i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw sub i8* @var8, i8 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i8 %old
 }
 
 define dso_local i16 @test_atomic_load_sub_i16(i16 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_sub_i16:
+; INLINE_ATOMICS-LABEL: test_atomic_load_sub_i16:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var16
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; INLINE_ATOMICS-NEXT:  .LBB5_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxrh w8, [x9]
+; INLINE_ATOMICS-NEXT:    sub w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stlxrh w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB5_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_sub_i16:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -166,25 +168,24 @@ define dso_local i16 @test_atomic_load_sub_i16(i16 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw sub i16* @var16, i16 %offset release
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i16 %old
 }
 
 define dso_local i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_sub_i32:
+; INLINE_ATOMICS-LABEL: test_atomic_load_sub_i32:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var32
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var32
+; INLINE_ATOMICS-NEXT:  .LBB6_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxr w8, [x9]
+; INLINE_ATOMICS-NEXT:    sub w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stxr w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB6_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_sub_i32:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -195,25 +196,24 @@ define dso_local i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw sub i32* @var32, i32 %offset acquire
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i32 %old
 }
 
 define dso_local i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_sub_i64:
+; INLINE_ATOMICS-LABEL: test_atomic_load_sub_i64:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var64
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var64
+; INLINE_ATOMICS-NEXT:  .LBB7_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxr x8, [x9]
+; INLINE_ATOMICS-NEXT:    sub x10, x8, x0
+; INLINE_ATOMICS-NEXT:    stlxr w11, x10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB7_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov x0, x8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_sub_i64:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -224,25 +224,24 @@ define dso_local i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw sub i64* @var64, i64 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; x0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-; CHECK-NEXT: sub [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov x0, x[[OLD]]
    ret i64 %old
 }
 
 define dso_local i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_and_i8:
+; INLINE_ATOMICS-LABEL: test_atomic_load_and_i8:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var8
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; INLINE_ATOMICS-NEXT:  .LBB8_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxrb w8, [x9]
+; INLINE_ATOMICS-NEXT:    and w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stlxrb w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB8_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_and_i8:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -253,25 +252,24 @@ define dso_local i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw and i8* @var8, i8 %offset release
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i8 %old
 }
 
 define dso_local i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_and_i16:
+; INLINE_ATOMICS-LABEL: test_atomic_load_and_i16:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var16
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; INLINE_ATOMICS-NEXT:  .LBB9_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxrh w8, [x9]
+; INLINE_ATOMICS-NEXT:    and w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stxrh w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB9_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_and_i16:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -282,25 +280,24 @@ define dso_local i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw and i16* @var16, i16 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i16 %old
 }
 
 define dso_local i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_and_i32:
+; INLINE_ATOMICS-LABEL: test_atomic_load_and_i32:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var32
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var32
+; INLINE_ATOMICS-NEXT:  .LBB10_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxr w8, [x9]
+; INLINE_ATOMICS-NEXT:    and w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stlxr w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB10_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_and_i32:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -311,25 +308,24 @@ define dso_local i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw and i32* @var32, i32 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i32 %old
 }
 
 define dso_local i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_and_i64:
+; INLINE_ATOMICS-LABEL: test_atomic_load_and_i64:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var64
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var64
+; INLINE_ATOMICS-NEXT:  .LBB11_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxr x8, [x9]
+; INLINE_ATOMICS-NEXT:    and x10, x8, x0
+; INLINE_ATOMICS-NEXT:    stxr w11, x10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB11_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov x0, x8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_and_i64:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -340,25 +336,24 @@ define dso_local i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw and i64* @var64, i64 %offset acquire
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; x0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-; CHECK-NEXT: and [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov x0, x[[OLD]]
    ret i64 %old
 }
 
 define dso_local i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_or_i8:
+; INLINE_ATOMICS-LABEL: test_atomic_load_or_i8:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var8
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; INLINE_ATOMICS-NEXT:  .LBB12_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxrb w8, [x9]
+; INLINE_ATOMICS-NEXT:    orr w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stlxrb w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB12_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_or_i8:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -368,25 +363,24 @@ define dso_local i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw or i8* @var8, i8 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i8 %old
 }
 
 define dso_local i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_or_i16:
+; INLINE_ATOMICS-LABEL: test_atomic_load_or_i16:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var16
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; INLINE_ATOMICS-NEXT:  .LBB13_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxrh w8, [x9]
+; INLINE_ATOMICS-NEXT:    orr w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stxrh w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB13_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_or_i16:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -396,25 +390,24 @@ define dso_local i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw or i16* @var16, i16 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i16 %old
 }
 
 define dso_local i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_or_i32:
+; INLINE_ATOMICS-LABEL: test_atomic_load_or_i32:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var32
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var32
+; INLINE_ATOMICS-NEXT:  .LBB14_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxr w8, [x9]
+; INLINE_ATOMICS-NEXT:    orr w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stxr w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB14_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_or_i32:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -424,25 +417,24 @@ define dso_local i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw or i32* @var32, i32 %offset acquire
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i32 %old
 }
 
 define dso_local i64 @test_atomic_load_or_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_or_i64:
+; INLINE_ATOMICS-LABEL: test_atomic_load_or_i64:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var64
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var64
+; INLINE_ATOMICS-NEXT:  .LBB15_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxr x8, [x9]
+; INLINE_ATOMICS-NEXT:    orr x10, x8, x0
+; INLINE_ATOMICS-NEXT:    stlxr w11, x10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB15_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov x0, x8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_or_i64:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -452,25 +444,24 @@ define dso_local i64 @test_atomic_load_or_i64(i64 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw or i64* @var64, i64 %offset release
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; x0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-; CHECK-NEXT: orr [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov x0, x[[OLD]]
    ret i64 %old
 }
 
 define dso_local i8 @test_atomic_load_xor_i8(i8 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xor_i8:
+; INLINE_ATOMICS-LABEL: test_atomic_load_xor_i8:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var8
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; INLINE_ATOMICS-NEXT:  .LBB16_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxrb w8, [x9]
+; INLINE_ATOMICS-NEXT:    eor w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stxrb w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB16_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_xor_i8:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -480,25 +471,24 @@ define dso_local i8 @test_atomic_load_xor_i8(i8 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw xor i8* @var8, i8 %offset acquire
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i8 %old
 }
 
 define dso_local i16 @test_atomic_load_xor_i16(i16 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xor_i16:
+; INLINE_ATOMICS-LABEL: test_atomic_load_xor_i16:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var16
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; INLINE_ATOMICS-NEXT:  .LBB17_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxrh w8, [x9]
+; INLINE_ATOMICS-NEXT:    eor w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stlxrh w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB17_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_xor_i16:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -508,25 +498,24 @@ define dso_local i16 @test_atomic_load_xor_i16(i16 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw xor i16* @var16, i16 %offset release
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i16 %old
 }
 
 define dso_local i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xor_i32:
+; INLINE_ATOMICS-LABEL: test_atomic_load_xor_i32:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var32
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var32
+; INLINE_ATOMICS-NEXT:  .LBB18_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxr w8, [x9]
+; INLINE_ATOMICS-NEXT:    eor w10, w8, w0
+; INLINE_ATOMICS-NEXT:    stlxr w11, w10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB18_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_xor_i32:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -536,25 +525,24 @@ define dso_local i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw xor i32* @var32, i32 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i32 %old
 }
 
 define dso_local i64 @test_atomic_load_xor_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xor_i64:
+; INLINE_ATOMICS-LABEL: test_atomic_load_xor_i64:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var64
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var64
+; INLINE_ATOMICS-NEXT:  .LBB19_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxr x8, [x9]
+; INLINE_ATOMICS-NEXT:    eor x10, x8, x0
+; INLINE_ATOMICS-NEXT:    stxr w11, x10, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w11, .LBB19_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov x0, x8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_xor_i64:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -564,25 +552,24 @@ define dso_local i64 @test_atomic_load_xor_i64(i64 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw xor i64* @var64, i64 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; x0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-; CHECK-NEXT: eor [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov x0, x[[OLD]]
    ret i64 %old
 }
 
 define dso_local i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xchg_i8:
+; INLINE_ATOMICS-LABEL: test_atomic_load_xchg_i8:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    // kill: def $w0 killed $w0 def $x0
+; INLINE_ATOMICS-NEXT:    adrp x9, var8
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; INLINE_ATOMICS-NEXT:  .LBB20_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxrb w8, [x9]
+; INLINE_ATOMICS-NEXT:    stxrb w10, w0, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w10, .LBB20_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_xchg_i8:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -592,24 +579,24 @@ define dso_local i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw xchg i8* @var8, i8 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i8 %old
 }
 
 define dso_local i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xchg_i16:
+; INLINE_ATOMICS-LABEL: test_atomic_load_xchg_i16:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    // kill: def $w0 killed $w0 def $x0
+; INLINE_ATOMICS-NEXT:    adrp x9, var16
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; INLINE_ATOMICS-NEXT:  .LBB21_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxrh w8, [x9]
+; INLINE_ATOMICS-NEXT:    stlxrh w10, w0, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w10, .LBB21_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_xchg_i16:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -619,25 +606,24 @@ define dso_local i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw xchg i16* @var16, i16 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i16 %old
 }
 
 define dso_local i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xchg_i32:
-; CHECK: mov {{[xw]}}8, w[[OLD:[0-9]+]]
+; INLINE_ATOMICS-LABEL: test_atomic_load_xchg_i32:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    mov w8, w0
+; INLINE_ATOMICS-NEXT:    adrp x9, var32
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var32
+; INLINE_ATOMICS-NEXT:  .LBB22_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxr w0, [x9]
+; INLINE_ATOMICS-NEXT:    stlxr w10, w8, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w10, .LBB22_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_xchg_i32:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -647,22 +633,23 @@ define dso_local i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw xchg i32* @var32, i32 %offset release
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxr {{[xw]}}[[OLD]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w8, [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
    ret i32 %old
 }
 
 define dso_local i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind {
-; CHECK-LABEL: test_atomic_load_xchg_i64:
+; INLINE_ATOMICS-LABEL: test_atomic_load_xchg_i64:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var64
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var64
+; INLINE_ATOMICS-NEXT:  .LBB23_1: // %atomicrmw.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxr x8, [x9]
+; INLINE_ATOMICS-NEXT:    stxr w10, x0, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w10, .LBB23_1
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
+; INLINE_ATOMICS-NEXT:    mov x0, x8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_load_xchg_i64:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -672,640 +659,341 @@ define dso_local i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; OUTLINE_ATOMICS-NEXT:    ret
    %old = atomicrmw xchg i64* @var64, i64 %offset acquire
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; x0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], x0, [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov x0, x[[OLD]]
    ret i64 %old
 }
 
 
 define dso_local i8 @test_atomic_load_min_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_min_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_min_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var8
-; OUTLINE_ATOMICS-NEXT:  .LBB24_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldaxrb w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    sxtb w8, w10
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0, sxtb
-; OUTLINE_ATOMICS-NEXT:    csel w10, w10, w0, le
-; OUTLINE_ATOMICS-NEXT:    stxrb w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB24_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var8
+; CHECK-NEXT:    add x9, x9, :lo12:var8
+; CHECK-NEXT:  .LBB24_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxrb w10, [x9]
+; CHECK-NEXT:    sxtb w8, w10
+; CHECK-NEXT:    cmp w8, w0, sxtb
+; CHECK-NEXT:    csel w10, w10, w0, le
+; CHECK-NEXT:    stxrb w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB24_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw min i8* @var8, i8 %offset acquire
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]]
-; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
-
-; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i8 %old
 }
 
 define dso_local i16 @test_atomic_load_min_i16(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_min_i16:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_min_i16:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var16
-; OUTLINE_ATOMICS-NEXT:  .LBB25_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldxrh w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    sxth w8, w10
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0, sxth
-; OUTLINE_ATOMICS-NEXT:    csel w10, w10, w0, le
-; OUTLINE_ATOMICS-NEXT:    stlxrh w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB25_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var16
+; CHECK-NEXT:    add x9, x9, :lo12:var16
+; CHECK-NEXT:  .LBB25_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldxrh w10, [x9]
+; CHECK-NEXT:    sxth w8, w10
+; CHECK-NEXT:    cmp w8, w0, sxth
+; CHECK-NEXT:    csel w10, w10, w0, le
+; CHECK-NEXT:    stlxrh w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB25_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw min i16* @var16, i16 %offset release
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]]
-; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
-
-
-; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i16 %old
 }
 
 define dso_local i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_min_i32:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_min_i32:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var32
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var32
-; OUTLINE_ATOMICS-NEXT:  .LBB26_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldxr w8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0
-; OUTLINE_ATOMICS-NEXT:    csel w10, w8, w0, le
-; OUTLINE_ATOMICS-NEXT:    stxr w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB26_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var32
+; CHECK-NEXT:    add x9, x9, :lo12:var32
+; CHECK-NEXT:  .LBB26_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldxr w8, [x9]
+; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    csel w10, w8, w0, le
+; CHECK-NEXT:    stxr w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB26_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw min i32* @var32, i32 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: cmp w[[OLD]], w0
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
-
-
-; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i32 %old
 }
 
 define dso_local i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_min_i64:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_min_i64:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var64
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var64
-; OUTLINE_ATOMICS-NEXT:  .LBB27_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldaxr x8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp x8, x0
-; OUTLINE_ATOMICS-NEXT:    csel x10, x8, x0, le
-; OUTLINE_ATOMICS-NEXT:    stlxr w11, x10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB27_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov x0, x8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var64
+; CHECK-NEXT:    add x9, x9, :lo12:var64
+; CHECK-NEXT:  .LBB27_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxr x8, [x9]
+; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    csel x10, x8, x0, le
+; CHECK-NEXT:    stlxr w11, x10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB27_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ret
    %old = atomicrmw min i64* @var64, i64 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; x0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-
-; CHECK-NEXT: cmp x[[OLD]], x0
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, le
-
-
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov x0, x[[OLD]]
    ret i64 %old
 }
 
 define dso_local i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_max_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_max_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var8
-; OUTLINE_ATOMICS-NEXT:  .LBB28_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldaxrb w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    sxtb w8, w10
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0, sxtb
-; OUTLINE_ATOMICS-NEXT:    csel w10, w10, w0, gt
-; OUTLINE_ATOMICS-NEXT:    stlxrb w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB28_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var8
+; CHECK-NEXT:    add x9, x9, :lo12:var8
+; CHECK-NEXT:  .LBB28_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxrb w10, [x9]
+; CHECK-NEXT:    sxtb w8, w10
+; CHECK-NEXT:    cmp w8, w0, sxtb
+; CHECK-NEXT:    csel w10, w10, w0, gt
+; CHECK-NEXT:    stlxrb w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB28_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw max i8* @var8, i8 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]]
-; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
-
-
-; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i8 %old
 }
 
 define dso_local i16 @test_atomic_load_max_i16(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_max_i16:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_max_i16:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var16
-; OUTLINE_ATOMICS-NEXT:  .LBB29_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldaxrh w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    sxth w8, w10
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0, sxth
-; OUTLINE_ATOMICS-NEXT:    csel w10, w10, w0, gt
-; OUTLINE_ATOMICS-NEXT:    stxrh w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB29_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var16
+; CHECK-NEXT:    add x9, x9, :lo12:var16
+; CHECK-NEXT:  .LBB29_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxrh w10, [x9]
+; CHECK-NEXT:    sxth w8, w10
+; CHECK-NEXT:    cmp w8, w0, sxth
+; CHECK-NEXT:    csel w10, w10, w0, gt
+; CHECK-NEXT:    stxrh w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB29_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw max i16* @var16, i16 %offset acquire
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]]
-; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
-
-
-; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i16 %old
 }
 
 define dso_local i32 @test_atomic_load_max_i32(i32 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_max_i32:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_max_i32:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var32
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var32
-; OUTLINE_ATOMICS-NEXT:  .LBB30_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldxr w8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0
-; OUTLINE_ATOMICS-NEXT:    csel w10, w8, w0, gt
-; OUTLINE_ATOMICS-NEXT:    stlxr w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB30_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var32
+; CHECK-NEXT:    add x9, x9, :lo12:var32
+; CHECK-NEXT:  .LBB30_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldxr w8, [x9]
+; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    csel w10, w8, w0, gt
+; CHECK-NEXT:    stlxr w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB30_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw max i32* @var32, i32 %offset release
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: cmp w[[OLD]], w0
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
-
-
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i32 %old
 }
 
 define dso_local i64 @test_atomic_load_max_i64(i64 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_max_i64:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_max_i64:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var64
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var64
-; OUTLINE_ATOMICS-NEXT:  .LBB31_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldxr x8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp x8, x0
-; OUTLINE_ATOMICS-NEXT:    csel x10, x8, x0, gt
-; OUTLINE_ATOMICS-NEXT:    stxr w11, x10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB31_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov x0, x8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var64
+; CHECK-NEXT:    add x9, x9, :lo12:var64
+; CHECK-NEXT:  .LBB31_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldxr x8, [x9]
+; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    csel x10, x8, x0, gt
+; CHECK-NEXT:    stxr w11, x10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB31_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ret
    %old = atomicrmw max i64* @var64, i64 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; x0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-
-; CHECK-NEXT: cmp x[[OLD]], x0
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt
-
-
-; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov x0, x[[OLD]]
    ret i64 %old
 }
 
 define dso_local i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_umin_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var8
-; OUTLINE_ATOMICS-NEXT:  .LBB32_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldxrb w8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0, uxtb
-; OUTLINE_ATOMICS-NEXT:    csel w10, w8, w0, ls
-; OUTLINE_ATOMICS-NEXT:    stxrb w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB32_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var8
+; CHECK-NEXT:    add x9, x9, :lo12:var8
+; CHECK-NEXT:  .LBB32_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldxrb w8, [x9]
+; CHECK-NEXT:    cmp w8, w0, uxtb
+; CHECK-NEXT:    csel w10, w8, w0, ls
+; CHECK-NEXT:    stxrb w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB32_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw umin i8* @var8, i8 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: cmp w[[OLD]], w0, uxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
-
-
-; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i8 %old
 }
 
 define dso_local i16 @test_atomic_load_umin_i16(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i16:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_umin_i16:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var16
-; OUTLINE_ATOMICS-NEXT:  .LBB33_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldaxrh w8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0, uxth
-; OUTLINE_ATOMICS-NEXT:    csel w10, w8, w0, ls
-; OUTLINE_ATOMICS-NEXT:    stxrh w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB33_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var16
+; CHECK-NEXT:    add x9, x9, :lo12:var16
+; CHECK-NEXT:  .LBB33_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxrh w8, [x9]
+; CHECK-NEXT:    cmp w8, w0, uxth
+; CHECK-NEXT:    csel w10, w8, w0, ls
+; CHECK-NEXT:    stxrh w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB33_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw umin i16* @var16, i16 %offset acquire
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: cmp w[[OLD]], w0, uxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
-
-
-; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i16 %old
 }
 
 define dso_local i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i32:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_umin_i32:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var32
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var32
-; OUTLINE_ATOMICS-NEXT:  .LBB34_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldaxr w8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0
-; OUTLINE_ATOMICS-NEXT:    csel w10, w8, w0, ls
-; OUTLINE_ATOMICS-NEXT:    stlxr w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB34_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var32
+; CHECK-NEXT:    add x9, x9, :lo12:var32
+; CHECK-NEXT:  .LBB34_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxr w8, [x9]
+; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    csel w10, w8, w0, ls
+; CHECK-NEXT:    stlxr w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB34_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw umin i32* @var32, i32 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: cmp w[[OLD]], w0
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
-
-
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i32 %old
 }
 
 define dso_local i64 @test_atomic_load_umin_i64(i64 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i64:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_umin_i64:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var64
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var64
-; OUTLINE_ATOMICS-NEXT:  .LBB35_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldaxr x8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp x8, x0
-; OUTLINE_ATOMICS-NEXT:    csel x10, x8, x0, ls
-; OUTLINE_ATOMICS-NEXT:    stlxr w11, x10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB35_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov x0, x8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var64
+; CHECK-NEXT:    add x9, x9, :lo12:var64
+; CHECK-NEXT:  .LBB35_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxr x8, [x9]
+; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    csel x10, x8, x0, ls
+; CHECK-NEXT:    stlxr w11, x10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB35_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ret
    %old = atomicrmw umin i64* @var64, i64 %offset acq_rel
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; x0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-
-; CHECK-NEXT: cmp x[[OLD]], x0
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, ls
-
-
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov x0, x[[OLD]]
    ret i64 %old
 }
 
 define dso_local i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_umax_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var8
-; OUTLINE_ATOMICS-NEXT:  .LBB36_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldaxrb w8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0, uxtb
-; OUTLINE_ATOMICS-NEXT:    csel w10, w8, w0, hi
-; OUTLINE_ATOMICS-NEXT:    stlxrb w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB36_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var8
+; CHECK-NEXT:    add x9, x9, :lo12:var8
+; CHECK-NEXT:  .LBB36_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxrb w8, [x9]
+; CHECK-NEXT:    cmp w8, w0, uxtb
+; CHECK-NEXT:    csel w10, w8, w0, hi
+; CHECK-NEXT:    stlxrb w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB36_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw umax i8* @var8, i8 %offset acq_rel
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: cmp w[[OLD]], w0, uxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
-
-
-; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i8 %old
 }
 
 define dso_local i16 @test_atomic_load_umax_i16(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i16:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_umax_i16:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var16
-; OUTLINE_ATOMICS-NEXT:  .LBB37_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldxrh w8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0, uxth
-; OUTLINE_ATOMICS-NEXT:    csel w10, w8, w0, hi
-; OUTLINE_ATOMICS-NEXT:    stxrh w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB37_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var16
+; CHECK-NEXT:    add x9, x9, :lo12:var16
+; CHECK-NEXT:  .LBB37_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldxrh w8, [x9]
+; CHECK-NEXT:    cmp w8, w0, uxth
+; CHECK-NEXT:    csel w10, w8, w0, hi
+; CHECK-NEXT:    stxrh w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB37_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw umax i16* @var16, i16 %offset monotonic
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: cmp w[[OLD]], w0, uxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
-
-
-; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i16 %old
 }
 
 define dso_local i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i32:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_umax_i32:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var32
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var32
-; OUTLINE_ATOMICS-NEXT:  .LBB38_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldaxr w8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp w8, w0
-; OUTLINE_ATOMICS-NEXT:    csel w10, w8, w0, hi
-; OUTLINE_ATOMICS-NEXT:    stlxr w11, w10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB38_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov w0, w8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var32
+; CHECK-NEXT:    add x9, x9, :lo12:var32
+; CHECK-NEXT:  .LBB38_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxr w8, [x9]
+; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    csel w10, w8, w0, hi
+; CHECK-NEXT:    stlxr w11, w10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB38_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
    %old = atomicrmw umax i32* @var32, i32 %offset seq_cst
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-
-; CHECK-NEXT: cmp w[[OLD]], w0
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
-
-
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i32 %old
 }
 
 define dso_local i64 @test_atomic_load_umax_i64(i64 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i64:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_umax_i64:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x9, var64
-; OUTLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var64
-; OUTLINE_ATOMICS-NEXT:  .LBB39_1: // %atomicrmw.start
-; OUTLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE_ATOMICS-NEXT:    ldxr x8, [x9]
-; OUTLINE_ATOMICS-NEXT:    cmp x8, x0
-; OUTLINE_ATOMICS-NEXT:    csel x10, x8, x0, hi
-; OUTLINE_ATOMICS-NEXT:    stlxr w11, x10, [x9]
-; OUTLINE_ATOMICS-NEXT:    cbnz w11, .LBB39_1
-; OUTLINE_ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
-; OUTLINE_ATOMICS-NEXT:    mov x0, x8
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x9, var64
+; CHECK-NEXT:    add x9, x9, :lo12:var64
+; CHECK-NEXT:  .LBB39_1: // %atomicrmw.start
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldxr x8, [x9]
+; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    csel x10, x8, x0, hi
+; CHECK-NEXT:    stlxr w11, x10, [x9]
+; CHECK-NEXT:    cbnz w11, .LBB39_1
+; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ret
    %old = atomicrmw umax i64* @var64, i64 %offset release
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-
-; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; x0 below is a reasonable guess but could change: it certainly comes into the
-  ; function there.
-
-; CHECK-NEXT: cmp x[[OLD]], x0
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi
-
-
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
-; CHECK-NOT: dmb
-
-; CHECK: mov x0, x[[OLD]]
    ret i64 %old
 }
 
 define dso_local i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
-; CHECK-LABEL: test_atomic_cmpxchg_i8:
+; INLINE_ATOMICS-LABEL: test_atomic_cmpxchg_i8:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    // kill: def $w1 killed $w1 def $x1
+; INLINE_ATOMICS-NEXT:    adrp x9, var8
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; INLINE_ATOMICS-NEXT:  .LBB40_1: // %cmpxchg.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxrb w8, [x9]
+; INLINE_ATOMICS-NEXT:    cmp w8, w0, uxtb
+; INLINE_ATOMICS-NEXT:    b.ne .LBB40_4
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %cmpxchg.trystore
+; INLINE_ATOMICS-NEXT:    // in Loop: Header=BB40_1 Depth=1
+; INLINE_ATOMICS-NEXT:    stxrb w10, w1, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w10, .LBB40_1
+; INLINE_ATOMICS-NEXT:  // %bb.3: // %cmpxchg.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+; INLINE_ATOMICS-NEXT:  .LBB40_4: // %cmpxchg.nostore
+; INLINE_ATOMICS-NEXT:    clrex
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_cmpxchg_i8:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -1316,29 +1004,32 @@ define dso_local i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ret
    %pair = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
    %old = extractvalue { i8, i1 } %pair, 0
-
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: cmp w[[OLD]], w0
-; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
-; CHECK: stxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
-; CHECK: [[GET_OUT]]:
-; CHECK: clrex
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i8 %old
 }
 
 define dso_local i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
-; CHECK-LABEL: test_atomic_cmpxchg_i16:
+; INLINE_ATOMICS-LABEL: test_atomic_cmpxchg_i16:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    // kill: def $w1 killed $w1 def $x1
+; INLINE_ATOMICS-NEXT:    adrp x9, var16
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; INLINE_ATOMICS-NEXT:  .LBB41_1: // %cmpxchg.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldaxrh w8, [x9]
+; INLINE_ATOMICS-NEXT:    cmp w8, w0, uxth
+; INLINE_ATOMICS-NEXT:    b.ne .LBB41_4
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %cmpxchg.trystore
+; INLINE_ATOMICS-NEXT:    // in Loop: Header=BB41_1 Depth=1
+; INLINE_ATOMICS-NEXT:    stlxrh w10, w1, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w10, .LBB41_1
+; INLINE_ATOMICS-NEXT:  // %bb.3: // %cmpxchg.end
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+; INLINE_ATOMICS-NEXT:  .LBB41_4: // %cmpxchg.nostore
+; INLINE_ATOMICS-NEXT:    clrex
+; INLINE_ATOMICS-NEXT:    mov w0, w8
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_cmpxchg_i16:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -1349,29 +1040,32 @@ define dso_local i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ret
    %pair = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
    %old = extractvalue { i16, i1 } %pair, 0
-
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: cmp w[[OLD]], w0
-; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
-; CHECK: stlxrh [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
-; CHECK: [[GET_OUT]]:
-; CHECK: clrex
-; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i16 %old
 }
 
 define dso_local i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
-; CHECK-LABEL: test_atomic_cmpxchg_i32:
+; INLINE_ATOMICS-LABEL: test_atomic_cmpxchg_i32:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    mov w8, w0
+; INLINE_ATOMICS-NEXT:    adrp x9, var32
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var32
+; INLINE_ATOMICS-NEXT:  .LBB42_1: // %cmpxchg.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxr w0, [x9]
+; INLINE_ATOMICS-NEXT:    cmp w0, w8
+; INLINE_ATOMICS-NEXT:    b.ne .LBB42_4
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %cmpxchg.trystore
+; INLINE_ATOMICS-NEXT:    // in Loop: Header=BB42_1 Depth=1
+; INLINE_ATOMICS-NEXT:    stlxr w10, w1, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w10, .LBB42_1
+; INLINE_ATOMICS-NEXT:  // %bb.3: // %cmpxchg.end
+; INLINE_ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; INLINE_ATOMICS-NEXT:    ret
+; INLINE_ATOMICS-NEXT:  .LBB42_4: // %cmpxchg.nostore
+; INLINE_ATOMICS-NEXT:    clrex
+; INLINE_ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_cmpxchg_i32:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -1382,27 +1076,31 @@ define dso_local i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ret
    %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
    %old = extractvalue { i32, i1 } %pair, 0
-
-; CHECK: mov {{[xw]}}[[WANTED:[0-9]+]], {{[xw]}}0
-
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
-
-; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-; CHECK-NEXT: cmp w[[OLD]], w[[WANTED]]
-; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
-; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
-; CHECK: [[GET_OUT]]:
-; CHECK: clrex
-; CHECK-NOT: dmb
    ret i32 %old
 }
 
 define dso_local void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
-; CHECK-LABEL: test_atomic_cmpxchg_i64:
+; INLINE_ATOMICS-LABEL: test_atomic_cmpxchg_i64:
+; INLINE_ATOMICS:       // %bb.0:
+; INLINE_ATOMICS-NEXT:    adrp x9, var64
+; INLINE_ATOMICS-NEXT:    add x9, x9, :lo12:var64
+; INLINE_ATOMICS-NEXT:  .LBB43_1: // %cmpxchg.start
+; INLINE_ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
+; INLINE_ATOMICS-NEXT:    ldxr x8, [x9]
+; INLINE_ATOMICS-NEXT:    cmp x8, x0
+; INLINE_ATOMICS-NEXT:    b.ne .LBB43_3
+; INLINE_ATOMICS-NEXT:  // %bb.2: // %cmpxchg.trystore
+; INLINE_ATOMICS-NEXT:    // in Loop: Header=BB43_1 Depth=1
+; INLINE_ATOMICS-NEXT:    stxr w10, x1, [x9]
+; INLINE_ATOMICS-NEXT:    cbnz w10, .LBB43_1
+; INLINE_ATOMICS-NEXT:    b .LBB43_4
+; INLINE_ATOMICS-NEXT:  .LBB43_3: // %cmpxchg.nostore
+; INLINE_ATOMICS-NEXT:    clrex
+; INLINE_ATOMICS-NEXT:  .LBB43_4: // %cmpxchg.end
+; INLINE_ATOMICS-NEXT:    adrp x9, var64
+; INLINE_ATOMICS-NEXT:    str x8, [x9, :lo12:var64]
+; INLINE_ATOMICS-NEXT:    ret
+;
 ; OUTLINE_ATOMICS-LABEL: test_atomic_cmpxchg_i64:
 ; OUTLINE_ATOMICS:       // %bb.0:
 ; OUTLINE_ATOMICS-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
@@ -1415,268 +1113,155 @@ define dso_local void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
 ; OUTLINE_ATOMICS-NEXT:    ret
    %pair = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
    %old = extractvalue { i64, i1 } %pair, 0
-
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
-
-; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: cmp x[[OLD]], x0
-; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
-  ; As above, w1 is a reasonable guess.
-; CHECK: stxr [[STATUS:w[0-9]+]], x1, [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
-; CHECK: [[GET_OUT]]:
-; CHECK: clrex
-; CHECK-NOT: dmb
-
-; CHECK: str x[[OLD]],
    store i64 %old, i64* @var64
    ret void
 }
 
 define dso_local i8 @test_atomic_load_monotonic_i8() nounwind {
 ; CHECK-LABEL: test_atomic_load_monotonic_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_monotonic_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x8, var8
-; OUTLINE_ATOMICS-NEXT:    ldrb w0, [x8, :lo12:var8]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, var8
+; CHECK-NEXT:    ldrb w0, [x8, :lo12:var8]
+; CHECK-NEXT:    ret
   %val = load atomic i8, i8* @var8 monotonic, align 1
-; CHECK-NOT: dmb
-; CHECK: adrp x[[HIADDR:[0-9]+]], var8
-; CHECK: ldrb w0, [x[[HIADDR]], {{#?}}:lo12:var8]
-; CHECK-NOT: dmb
-
   ret i8 %val
 }
 
 define dso_local i8 @test_atomic_load_monotonic_regoff_i8(i64 %base, i64 %off) nounwind {
 ; CHECK-LABEL: test_atomic_load_monotonic_regoff_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_monotonic_regoff_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    ldrb w0, [x0, x1]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrb w0, [x0, x1]
+; CHECK-NEXT:    ret
   %addr_int = add i64 %base, %off
   %addr = inttoptr i64 %addr_int to i8*
-
   %val = load atomic i8, i8* %addr monotonic, align 1
-; CHECK-NOT: dmb
-; CHECK: ldrb w0, [x0, x1]
-; CHECK-NOT: dmb
-
   ret i8 %val
 }
 
 define dso_local i8 @test_atomic_load_acquire_i8() nounwind {
 ; CHECK-LABEL: test_atomic_load_acquire_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_acquire_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x8, var8
-; OUTLINE_ATOMICS-NEXT:    add x8, x8, :lo12:var8
-; OUTLINE_ATOMICS-NEXT:    ldarb w0, [x8]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, var8
+; CHECK-NEXT:    add x8, x8, :lo12:var8
+; CHECK-NEXT:    ldarb w0, [x8]
+; CHECK-NEXT:    ret
   %val = load atomic i8, i8* @var8 acquire, align 1
-; CHECK-NOT: dmb
-; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-; CHECK-NOT: dmb
-; CHECK: ldarb w0, [x[[ADDR]]]
-; CHECK-NOT: dmb
   ret i8 %val
 }
 
 define dso_local i8 @test_atomic_load_seq_cst_i8() nounwind {
 ; CHECK-LABEL: test_atomic_load_seq_cst_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_seq_cst_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x8, var8
-; OUTLINE_ATOMICS-NEXT:    add x8, x8, :lo12:var8
-; OUTLINE_ATOMICS-NEXT:    ldarb w0, [x8]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, var8
+; CHECK-NEXT:    add x8, x8, :lo12:var8
+; CHECK-NEXT:    ldarb w0, [x8]
+; CHECK-NEXT:    ret
   %val = load atomic i8, i8* @var8 seq_cst, align 1
-; CHECK-NOT: dmb
-; CHECK: adrp [[HIADDR:x[0-9]+]], var8
-; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var8
-; CHECK-NOT: dmb
-; CHECK: ldarb w0, [x[[ADDR]]]
-; CHECK-NOT: dmb
   ret i8 %val
 }
 
 define dso_local i16 @test_atomic_load_monotonic_i16() nounwind {
 ; CHECK-LABEL: test_atomic_load_monotonic_i16:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_monotonic_i16:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x8, var16
-; OUTLINE_ATOMICS-NEXT:    ldrh w0, [x8, :lo12:var16]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, var16
+; CHECK-NEXT:    ldrh w0, [x8, :lo12:var16]
+; CHECK-NEXT:    ret
   %val = load atomic i16, i16* @var16 monotonic, align 2
-; CHECK-NOT: dmb
-; CHECK: adrp x[[HIADDR:[0-9]+]], var16
-; CHECK-NOT: dmb
-; CHECK: ldrh w0, [x[[HIADDR]], {{#?}}:lo12:var16]
-; CHECK-NOT: dmb
-
   ret i16 %val
 }
 
 define dso_local i32 @test_atomic_load_monotonic_regoff_i32(i64 %base, i64 %off) nounwind {
 ; CHECK-LABEL: test_atomic_load_monotonic_regoff_i32:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_monotonic_regoff_i32:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    ldr w0, [x0, x1]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w0, [x0, x1]
+; CHECK-NEXT:    ret
   %addr_int = add i64 %base, %off
   %addr = inttoptr i64 %addr_int to i32*
-
   %val = load atomic i32, i32* %addr monotonic, align 4
-; CHECK-NOT: dmb
-; CHECK: ldr w0, [x0, x1]
-; CHECK-NOT: dmb
-
   ret i32 %val
 }
 
 define dso_local i64 @test_atomic_load_seq_cst_i64() nounwind {
 ; CHECK-LABEL: test_atomic_load_seq_cst_i64:
-; OUTLINE_ATOMICS-LABEL: test_atomic_load_seq_cst_i64:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x8, var64
-; OUTLINE_ATOMICS-NEXT:    add x8, x8, :lo12:var64
-; OUTLINE_ATOMICS-NEXT:    ldar x0, [x8]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, var64
+; CHECK-NEXT:    add x8, x8, :lo12:var64
+; CHECK-NEXT:    ldar x0, [x8]
+; CHECK-NEXT:    ret
   %val = load atomic i64, i64* @var64 seq_cst, align 8
-; CHECK-NOT: dmb
-; CHECK: adrp [[HIADDR:x[0-9]+]], var64
-; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var64
-; CHECK-NOT: dmb
-; CHECK: ldar x0, [x[[ADDR]]]
-; CHECK-NOT: dmb
   ret i64 %val
 }
 
 define dso_local void @test_atomic_store_monotonic_i8(i8 %val) nounwind {
 ; CHECK-LABEL: test_atomic_store_monotonic_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_store_monotonic_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x8, var8
-; OUTLINE_ATOMICS-NEXT:    strb w0, [x8, :lo12:var8]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, var8
+; CHECK-NEXT:    strb w0, [x8, :lo12:var8]
+; CHECK-NEXT:    ret
   store atomic i8 %val, i8* @var8 monotonic, align 1
-; CHECK: adrp x[[HIADDR:[0-9]+]], var8
-; CHECK: strb w0, [x[[HIADDR]], {{#?}}:lo12:var8]
-
   ret void
 }
 
 define dso_local void @test_atomic_store_monotonic_regoff_i8(i64 %base, i64 %off, i8 %val) nounwind {
 ; CHECK-LABEL: test_atomic_store_monotonic_regoff_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_store_monotonic_regoff_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    strb w2, [x0, x1]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    strb w2, [x0, x1]
+; CHECK-NEXT:    ret
   %addr_int = add i64 %base, %off
   %addr = inttoptr i64 %addr_int to i8*
-
   store atomic i8 %val, i8* %addr monotonic, align 1
-; CHECK: strb w2, [x0, x1]
-
   ret void
 }
 define dso_local void @test_atomic_store_release_i8(i8 %val) nounwind {
 ; CHECK-LABEL: test_atomic_store_release_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_store_release_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x8, var8
-; OUTLINE_ATOMICS-NEXT:    add x8, x8, :lo12:var8
-; OUTLINE_ATOMICS-NEXT:    stlrb w0, [x8]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, var8
+; CHECK-NEXT:    add x8, x8, :lo12:var8
+; CHECK-NEXT:    stlrb w0, [x8]
+; CHECK-NEXT:    ret
   store atomic i8 %val, i8* @var8 release, align 1
-; CHECK-NOT: dmb
-; CHECK: adrp [[HIADDR:x[0-9]+]], var8
-; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var8
-; CHECK-NOT: dmb
-; CHECK: stlrb w0, [x[[ADDR]]]
-; CHECK-NOT: dmb
   ret void
 }
 
 define dso_local void @test_atomic_store_seq_cst_i8(i8 %val) nounwind {
 ; CHECK-LABEL: test_atomic_store_seq_cst_i8:
-; OUTLINE_ATOMICS-LABEL: test_atomic_store_seq_cst_i8:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x8, var8
-; OUTLINE_ATOMICS-NEXT:    add x8, x8, :lo12:var8
-; OUTLINE_ATOMICS-NEXT:    stlrb w0, [x8]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, var8
+; CHECK-NEXT:    add x8, x8, :lo12:var8
+; CHECK-NEXT:    stlrb w0, [x8]
+; CHECK-NEXT:    ret
   store atomic i8 %val, i8* @var8 seq_cst, align 1
-; CHECK-NOT: dmb
-; CHECK: adrp [[HIADDR:x[0-9]+]], var8
-; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var8
-; CHECK-NOT: dmb
-; CHECK: stlrb w0, [x[[ADDR]]]
-; CHECK-NOT: dmb
-
   ret void
 }
 
 define dso_local void @test_atomic_store_monotonic_i16(i16 %val) nounwind {
 ; CHECK-LABEL: test_atomic_store_monotonic_i16:
-; OUTLINE_ATOMICS-LABEL: test_atomic_store_monotonic_i16:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x8, var16
-; OUTLINE_ATOMICS-NEXT:    strh w0, [x8, :lo12:var16]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, var16
+; CHECK-NEXT:    strh w0, [x8, :lo12:var16]
+; CHECK-NEXT:    ret
   store atomic i16 %val, i16* @var16 monotonic, align 2
-; CHECK-NOT: dmb
-; CHECK: adrp x[[HIADDR:[0-9]+]], var16
-; CHECK-NOT: dmb
-; CHECK: strh w0, [x[[HIADDR]], {{#?}}:lo12:var16]
-; CHECK-NOT: dmb
   ret void
 }
 
 define dso_local void @test_atomic_store_monotonic_regoff_i32(i64 %base, i64 %off, i32 %val) nounwind {
 ; CHECK-LABEL: test_atomic_store_monotonic_regoff_i32:
-; OUTLINE_ATOMICS-LABEL: test_atomic_store_monotonic_regoff_i32:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    str w2, [x0, x1]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str w2, [x0, x1]
+; CHECK-NEXT:    ret
   %addr_int = add i64 %base, %off
   %addr = inttoptr i64 %addr_int to i32*
-
   store atomic i32 %val, i32* %addr monotonic, align 4
-; CHECK-NOT: dmb
-; CHECK: str w2, [x0, x1]
-; CHECK-NOT: dmb
-
   ret void
 }
 
 define dso_local void @test_atomic_store_release_i64(i64 %val) nounwind {
 ; CHECK-LABEL: test_atomic_store_release_i64:
-; OUTLINE_ATOMICS-LABEL: test_atomic_store_release_i64:
-; OUTLINE_ATOMICS:       // %bb.0:
-; OUTLINE_ATOMICS-NEXT:    adrp x8, var64
-; OUTLINE_ATOMICS-NEXT:    add x8, x8, :lo12:var64
-; OUTLINE_ATOMICS-NEXT:    stlr x0, [x8]
-; OUTLINE_ATOMICS-NEXT:    ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, var64
+; CHECK-NEXT:    add x8, x8, :lo12:var64
+; CHECK-NEXT:    stlr x0, [x8]
+; CHECK-NEXT:    ret
   store atomic i64 %val, i64* @var64 release, align 8
-; CHECK-NOT: dmb
-; CHECK: adrp [[HIADDR:x[0-9]+]], var64
-; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var64
-; CHECK-NOT: dmb
-; CHECK: stlr x0, [x[[ADDR]]]
-; CHECK-NOT: dmb
   ret void
 }

diff  --git a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
index 2b356c5981a8..12d5b20409cf 100644
--- a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
+++ b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
@@ -1,56 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-apple-ios7.0 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-apple-ios7.0 -mattr=+outline-atomics -o - %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
 
 define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
-; OUTLINE-ATOMICS: bl ___aarch64_cas4_acq_rel
 ; CHECK-LABEL: test_return:
-
-; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxr [[LOADED:w[0-9]+]], [x0]
-; CHECK: cmp [[LOADED]], w1
-; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
-
-; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x0]
-; CHECK: cbnz [[STATUS]], [[LOOP]]
-
-; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: mov w0, #1
-; CHECK: ret
-
-; CHECK: [[FAILED]]:
-; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: mov w0, wzr
-; CHECK: ret
-
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:  LBB0_1: ; %cmpxchg.start
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxr w8, [x0]
+; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    b.ne LBB0_4
+; CHECK-NEXT:  ; %bb.2: ; %cmpxchg.trystore
+; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    stlxr w8, w2, [x0]
+; CHECK-NEXT:    cbnz w8, LBB0_1
+; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  LBB0_4: ; %cmpxchg.nostore
+; CHECK-NEXT:    clrex
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+;
+; OUTLINE-ATOMICS-LABEL: test_return:
+; OUTLINE-ATOMICS:       ; %bb.0:
+; OUTLINE-ATOMICS-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; OUTLINE-ATOMICS-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; OUTLINE-ATOMICS-NEXT:    .cfi_def_cfa_offset 32
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w30, -8
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w29, -16
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w19, -24
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w20, -32
+; OUTLINE-ATOMICS-NEXT:    mov x8, x0
+; OUTLINE-ATOMICS-NEXT:    mov w19, w1
+; OUTLINE-ATOMICS-NEXT:    mov w0, w1
+; OUTLINE-ATOMICS-NEXT:    mov w1, w2
+; OUTLINE-ATOMICS-NEXT:    mov x2, x8
+; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas4_acq_rel
+; OUTLINE-ATOMICS-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w19
+; OUTLINE-ATOMICS-NEXT:    cset w0, eq
+; OUTLINE-ATOMICS-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; OUTLINE-ATOMICS-NEXT:    ret
   %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
   %success = extractvalue { i32, i1 } %pair, 1
   %conv = zext i1 %success to i32
   ret i32 %conv
 }
 
+; FIXME: DAG combine should be able to deal with this EOR better.
 define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
-; OUTLINE-ATOMICS: bl ___aarch64_cas1_acq_rel
 ; CHECK-LABEL: test_return_bool:
-
-; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxrb [[LOADED:w[0-9]+]], [x0]
-; CHECK: cmp [[LOADED]], w1, uxtb
-; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
-
-; CHECK: stlxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x0]
-; CHECK: cbnz [[STATUS]], [[LOOP]]
-
-; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
-  ; FIXME: DAG combine should be able to deal with this.
-; CHECK: mov [[TMP:w[0-9]+]], #1
-; CHECK: eor w0, [[TMP]], #0x1
-; CHECK: ret
-
-; CHECK: [[FAILED]]:
-; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: eor w0, wzr, #0x1
-; CHECK: ret
-
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ; kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT:  LBB1_1: ; %cmpxchg.start
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxrb w8, [x0]
+; CHECK-NEXT:    cmp w8, w1, uxtb
+; CHECK-NEXT:    b.ne LBB1_4
+; CHECK-NEXT:  ; %bb.2: ; %cmpxchg.trystore
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    stlxrb w8, w2, [x0]
+; CHECK-NEXT:    cbnz w8, LBB1_1
+; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    eor w0, w8, #0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  LBB1_4: ; %cmpxchg.nostore
+; CHECK-NEXT:    clrex
+; CHECK-NEXT:    eor w0, wzr, #0x1
+; CHECK-NEXT:    ret
+;
+; OUTLINE-ATOMICS-LABEL: test_return_bool:
+; OUTLINE-ATOMICS:       ; %bb.0:
+; OUTLINE-ATOMICS-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; OUTLINE-ATOMICS-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; OUTLINE-ATOMICS-NEXT:    .cfi_def_cfa_offset 32
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w30, -8
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w29, -16
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w19, -24
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w20, -32
+; OUTLINE-ATOMICS-NEXT:    mov x8, x0
+; OUTLINE-ATOMICS-NEXT:    mov w19, w1
+; OUTLINE-ATOMICS-NEXT:    mov w0, w1
+; OUTLINE-ATOMICS-NEXT:    mov w1, w2
+; OUTLINE-ATOMICS-NEXT:    mov x2, x8
+; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas1_acq_rel
+; OUTLINE-ATOMICS-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w19, uxtb
+; OUTLINE-ATOMICS-NEXT:    cset w8, eq
+; OUTLINE-ATOMICS-NEXT:    eor w0, w8, #0x1
+; OUTLINE-ATOMICS-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; OUTLINE-ATOMICS-NEXT:    ret
   %pair = cmpxchg i8* %value, i8 %oldValue, i8 %newValue acq_rel monotonic
   %success = extractvalue { i8, i1 } %pair, 1
   %failure = xor i1 %success, 1
@@ -58,24 +99,48 @@ define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
 }
 
 define void @test_conditional(i32* %p, i32 %oldval, i32 %newval) {
-; OUTLINE-ATOMICS: bl ___aarch64_cas4_acq_rel
 ; CHECK-LABEL: test_conditional:
-
-; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxr [[LOADED:w[0-9]+]], [x0]
-; CHECK: cmp [[LOADED]], w1
-; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
-
-; CHECK: stlxr [[STATUS:w[0-9]+]], w2, [x0]
-; CHECK: cbnz [[STATUS]], [[LOOP]]
-
-; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: b _bar
-
-; CHECK: [[FAILED]]:
-; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: b _baz
-
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:  LBB2_1: ; %cmpxchg.start
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxr w8, [x0]
+; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    b.ne LBB2_4
+; CHECK-NEXT:  ; %bb.2: ; %cmpxchg.trystore
+; CHECK-NEXT:    ; in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    stlxr w8, w2, [x0]
+; CHECK-NEXT:    cbnz w8, LBB2_1
+; CHECK-NEXT:  ; %bb.3: ; %true
+; CHECK-NEXT:    b _bar
+; CHECK-NEXT:  LBB2_4: ; %cmpxchg.nostore
+; CHECK-NEXT:    clrex
+; CHECK-NEXT:    b _baz
+;
+; OUTLINE-ATOMICS-LABEL: test_conditional:
+; OUTLINE-ATOMICS:       ; %bb.0:
+; OUTLINE-ATOMICS-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; OUTLINE-ATOMICS-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; OUTLINE-ATOMICS-NEXT:    .cfi_def_cfa_offset 32
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w30, -8
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w29, -16
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w19, -24
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w20, -32
+; OUTLINE-ATOMICS-NEXT:    mov x8, x0
+; OUTLINE-ATOMICS-NEXT:    mov w19, w1
+; OUTLINE-ATOMICS-NEXT:    mov w0, w1
+; OUTLINE-ATOMICS-NEXT:    mov w1, w2
+; OUTLINE-ATOMICS-NEXT:    mov x2, x8
+; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas4_acq_rel
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w19
+; OUTLINE-ATOMICS-NEXT:    b.ne LBB2_2
+; OUTLINE-ATOMICS-NEXT:  ; %bb.1: ; %true
+; OUTLINE-ATOMICS-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; OUTLINE-ATOMICS-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; OUTLINE-ATOMICS-NEXT:    b _bar
+; OUTLINE-ATOMICS-NEXT:  LBB2_2: ; %false
+; OUTLINE-ATOMICS-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; OUTLINE-ATOMICS-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; OUTLINE-ATOMICS-NEXT:    b _baz
   %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
   %success = extractvalue { i32, i1 } %pair, 1
   br i1 %success, label %true, label %false
@@ -95,31 +160,106 @@ end:
 declare void @bar()
 declare void @baz()
 
+; verify the preheader is simplified by simplifycfg.
 define i1 @test_conditional2(i32 %a, i32 %b, i32* %c) {
-; OUTLINE-ATOMICS: bl ___aarch64_cas4_acq_rel
 ; CHECK-LABEL: test_conditional2:
-; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxr [[LOADED:w[0-9]+]], [x19]
-; CHECK: cmp [[LOADED]], w21
-; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
-
-; CHECK: stlxr [[STATUS:w[0-9]+]], w20, [x19]
-; CHECK: cbnz [[STATUS]], [[LOOP]]
-; CHECK: mov [[STATUS]], #1
-; CHECK: b [[PH:LBB[0-9]+_[0-9]+]]
-
-; CHECK: [[FAILED]]:
-; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
-
-; verify the preheader is simplified by simplifycfg.
-; CHECK: [[PH]]:
-; CHECK: mov w22, #2
-; CHECK-NOT: mov w22, #4
-; CHECK-NOT: cmn w22, #4
-; CHECK: [[LOOP2:LBB[0-9]+_[0-9]+]]: ; %for.cond
-; CHECK-NOT: b.ne [[LOOP2]]
-; CHECK-NOT: b {{LBB[0-9]+_[0-9]+}}
-; CHECK: bl _foo
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_offset w19, -24
+; CHECK-NEXT:    .cfi_offset w20, -32
+; CHECK-NEXT:    .cfi_offset w21, -40
+; CHECK-NEXT:    .cfi_offset w22, -48
+; CHECK-NEXT:    mov x19, x2
+; CHECK-NEXT:    mov w20, w1
+; CHECK-NEXT:    mov w21, w0
+; CHECK-NEXT:  LBB3_1: ; %cmpxchg.start
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldaxr w8, [x19]
+; CHECK-NEXT:    cmp w8, w21
+; CHECK-NEXT:    b.ne LBB3_4
+; CHECK-NEXT:  ; %bb.2: ; %cmpxchg.trystore
+; CHECK-NEXT:    ; in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    stlxr w8, w20, [x19]
+; CHECK-NEXT:    cbnz w8, LBB3_1
+; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    b LBB3_5
+; CHECK-NEXT:  LBB3_4: ; %cmpxchg.nostore
+; CHECK-NEXT:    clrex
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:  LBB3_5: ; %for.cond.preheader
+; CHECK-NEXT:    mov w22, #2
+; CHECK-NEXT:  LBB3_6: ; %for.cond
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cbz w22, LBB3_9
+; CHECK-NEXT:  ; %bb.7: ; %for.body
+; CHECK-NEXT:    ; in Loop: Header=BB3_6 Depth=1
+; CHECK-NEXT:    sub w22, w22, #1
+; CHECK-NEXT:    ldr w10, [x19, w22, sxtw #2]
+; CHECK-NEXT:    orr w9, w21, w20
+; CHECK-NEXT:    cmp w9, w10
+; CHECK-NEXT:    b.eq LBB3_6
+; CHECK-NEXT:  ; %bb.8: ; %if.then
+; CHECK-NEXT:    ; in Loop: Header=BB3_6 Depth=1
+; CHECK-NEXT:    sxtw x8, w22
+; CHECK-NEXT:    str w9, [x19, x8, lsl #2]
+; CHECK-NEXT:    bl _foo
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    b LBB3_6
+; CHECK-NEXT:  LBB3_9: ; %for.cond.cleanup
+; CHECK-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ldp x22, x21, [sp], #48 ; 16-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; OUTLINE-ATOMICS-LABEL: test_conditional2:
+; OUTLINE-ATOMICS:       ; %bb.0: ; %entry
+; OUTLINE-ATOMICS-NEXT:    stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill
+; OUTLINE-ATOMICS-NEXT:    stp x20, x19, [sp, #16] ; 16-byte Folded Spill
+; OUTLINE-ATOMICS-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; OUTLINE-ATOMICS-NEXT:    .cfi_def_cfa_offset 48
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w30, -8
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w29, -16
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w19, -24
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w20, -32
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w21, -40
+; OUTLINE-ATOMICS-NEXT:    .cfi_offset w22, -48
+; OUTLINE-ATOMICS-NEXT:    mov x19, x2
+; OUTLINE-ATOMICS-NEXT:    mov w20, w1
+; OUTLINE-ATOMICS-NEXT:    mov w21, w0
+; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas4_acq_rel
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w21
+; OUTLINE-ATOMICS-NEXT:    cset w8, eq
+; OUTLINE-ATOMICS-NEXT:    mov w22, #2
+; OUTLINE-ATOMICS-NEXT:  LBB3_1: ; %for.cond
+; OUTLINE-ATOMICS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; OUTLINE-ATOMICS-NEXT:    cbz w22, LBB3_4
+; OUTLINE-ATOMICS-NEXT:  ; %bb.2: ; %for.body
+; OUTLINE-ATOMICS-NEXT:    ; in Loop: Header=BB3_1 Depth=1
+; OUTLINE-ATOMICS-NEXT:    sub w22, w22, #1
+; OUTLINE-ATOMICS-NEXT:    ldr w10, [x19, w22, sxtw #2]
+; OUTLINE-ATOMICS-NEXT:    orr w9, w21, w20
+; OUTLINE-ATOMICS-NEXT:    cmp w9, w10
+; OUTLINE-ATOMICS-NEXT:    b.eq LBB3_1
+; OUTLINE-ATOMICS-NEXT:  ; %bb.3: ; %if.then
+; OUTLINE-ATOMICS-NEXT:    ; in Loop: Header=BB3_1 Depth=1
+; OUTLINE-ATOMICS-NEXT:    sxtw x8, w22
+; OUTLINE-ATOMICS-NEXT:    str w9, [x19, x8, lsl #2]
+; OUTLINE-ATOMICS-NEXT:    bl _foo
+; OUTLINE-ATOMICS-NEXT:    mov w8, wzr
+; OUTLINE-ATOMICS-NEXT:    b LBB3_1
+; OUTLINE-ATOMICS-NEXT:  LBB3_4: ; %for.cond.cleanup
+; OUTLINE-ATOMICS-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; OUTLINE-ATOMICS-NEXT:    ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
+; OUTLINE-ATOMICS-NEXT:    and w0, w8, #0x1
+; OUTLINE-ATOMICS-NEXT:    ldp x22, x21, [sp], #48 ; 16-byte Folded Reload
+; OUTLINE-ATOMICS-NEXT:    ret
 entry:
   %pair = cmpxchg i32* %c, i32 %a, i32 %b seq_cst seq_cst
   %success = extractvalue { i32, i1 } %pair, 1

diff  --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
index 905a12843866..4a3002554cde 100644
--- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -109,6 +109,19 @@ define i1 @add_ugecmp_i16_i8(i16 %x) nounwind {
   ret i1 %tmp1
 }
 
+define i1 @add_ugecmp_i32_i16_i8(i16 %xx) nounwind { ; i16 argument is widened to i32, then run through the add + uge range test
+; CHECK-LABEL: add_ugecmp_i32_i16_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    cmp w8, w8, sxtb
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %x = zext i16 %xx to i32 ; tested value comes from a zext, so bits 16..31 are zero
+  %tmp0 = add i32 %x, -128 ; ~0U << (8-1)
+  %tmp1 = icmp uge i32 %tmp0, -256 ; ~0U << 8
+  ret i1 %tmp1
+}
+
 define i1 @add_ugecmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i32_i16:
 ; CHECK:       // %bb.0:

diff  --git a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
new file mode 100644
index 000000000000..c9efc1d0418e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
@@ -0,0 +1,347 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+
+define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) { ; i16 add feeding an unsigned compare; returns 2 or 5
+; CHECK-LABEL: overflow_add:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w1, w0
+; CHECK-NEXT:    orr w8, w8, #0x1
+; CHECK-NEXT:    and w8, w8, #0xffff
+; CHECK-NEXT:    mov w9, #5
+; CHECK-NEXT:    cmp w8, #1024
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    csel w0, w8, w9, hi
+; CHECK-NEXT:    ret
+  %add = add i16 %b, %a ; no nuw/nsw flags: the sum wraps modulo 2^16
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024 ; unsigned compare on the 16-bit result
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}
+
+define zeroext i16 @overflow_sub(i16 zeroext %a, i16 zeroext %b) { ; i16 sub feeding an unsigned compare; returns 2 or 5
+; CHECK-LABEL: overflow_sub:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, w1
+; CHECK-NEXT:    orr w8, w8, #0x1
+; CHECK-NEXT:    and w8, w8, #0xffff
+; CHECK-NEXT:    mov w9, #5
+; CHECK-NEXT:    cmp w8, #1024
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    csel w0, w8, w9, hi
+; CHECK-NEXT:    ret
+  %add = sub i16 %a, %b ; named %add but is a sub; no nuw/nsw, so it wraps modulo 2^16
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024 ; unsigned compare on the 16-bit result
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}
+
+define zeroext i16 @overflow_mul(i16 zeroext %a, i16 zeroext %b) { ; i16 mul feeding an unsigned compare; returns 2 or 5
+; CHECK-LABEL: overflow_mul:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul w8, w1, w0
+; CHECK-NEXT:    orr w8, w8, #0x1
+; CHECK-NEXT:    and w8, w8, #0xffff
+; CHECK-NEXT:    mov w9, #5
+; CHECK-NEXT:    cmp w8, #1024
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    csel w0, w8, w9, hi
+; CHECK-NEXT:    ret
+  %add = mul i16 %b, %a ; named %add but is a mul; no nuw/nsw, so it wraps modulo 2^16
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024 ; unsigned compare on the 16-bit result
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}
+
+define zeroext i16 @overflow_shl(i16 zeroext %a, i16 zeroext %b) { ; i16 shl feeding an unsigned compare; returns 2 or 5
+; CHECK-LABEL: overflow_shl:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl w8, w0, w1
+; CHECK-NEXT:    orr w8, w8, #0x1
+; CHECK-NEXT:    and w8, w8, #0xffff
+; CHECK-NEXT:    mov w9, #5
+; CHECK-NEXT:    cmp w8, #1024
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    csel w0, w8, w9, hi
+; CHECK-NEXT:    ret
+  %add = shl i16 %a, %b ; named %add but is a shl of %a by %b, without nuw/nsw
+  %or = or i16 %add, 1
+  %cmp = icmp ugt i16 %or, 1024 ; unsigned compare on the 16-bit result
+  %res = select i1 %cmp, i16 2, i16 5
+  ret i16 %res
+}
+
+define i32 @overflow_add_no_consts(i8 zeroext %a, i8 zeroext %b, i8 zeroext %limit) { ; i8 add compared against a variable limit; returns 8 or 16
+; CHECK-LABEL: overflow_add_no_consts:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w1, w0
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    cmp w2, w8, uxtb
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    csel w0, w8, w9, lo
+; CHECK-NEXT:    ret
+  %add = add i8 %b, %a ; no nuw/nsw flags: the sum wraps modulo 2^8
+  %cmp = icmp ugt i8 %add, %limit ; both compare operands are non-constant
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) { ; i8 add compared against a constant limit; returns 8 or 16
+; CHECK-LABEL: overflow_add_const_limit:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w1, w0
+; CHECK-NEXT:    and w8, w8, #0xff
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    cmp w8, #128
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    csel w0, w8, w9, hi
+; CHECK-NEXT:    ret
+  %add = add i8 %b, %a ; no nuw/nsw flags: the sum wraps modulo 2^8
+  %cmp = icmp ugt i8 %add, -128 ; i8 -128 is 128 (0x80) when read as unsigned
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+define i32 @overflow_add_positive_const_limit(i8 zeroext %a) { ; despite the name, this IR has no add: only a signed compare of %a with -1
+; CHECK-LABEL: overflow_add_positive_const_limit:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    cmp w8, w0, sxtb
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    csel w0, w8, w9, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp slt i8 %a, -1 ; signed predicate applied to an argument that is passed zeroext
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+define i32 @unsafe_add_underflow(i8 zeroext %a) { ; despite the name, this IR has no add: only an equality test of %a against 1
+; CHECK-LABEL: unsafe_add_underflow:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, #1
+; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    csel w0, w9, w8, eq
+; CHECK-NEXT:    ret
+  %cmp = icmp eq i8 %a, 1
+  %res = select i1 %cmp, i32 8, i32 16 ; 8 when %a == 1, otherwise 16
+  ret i32 %res
+}
+
+define i32 @safe_add_underflow(i8 zeroext %a) { ; despite the name, this IR has no add: only an equality test of %a against 0
+; CHECK-LABEL: safe_add_underflow:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    csel w0, w9, w8, eq
+; CHECK-NEXT:    ret
+  %cmp = icmp eq i8 %a, 0
+  %res = select i1 %cmp, i32 8, i32 16 ; 8 when %a == 0, otherwise 16
+  ret i32 %res
+}
+
+define i32 @safe_add_underflow_neg(i8 zeroext %a) { ; i8 add of a negative constant, then unsigned-less-than against a negative constant
+; CHECK-LABEL: safe_add_underflow_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, #2
+; CHECK-NEXT:    and w8, w8, #0xff
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    cmp w8, #251
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    csel w0, w8, w9, lo
+; CHECK-NEXT:    ret
+  %add = add i8 %a, -2 ; wraps for %a < 2 (no nuw/nsw flags)
+  %cmp = icmp ult i8 %add, -5 ; i8 -5 is 251 when read as unsigned
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+define i32 @overflow_sub_negative_const_limit(i8 zeroext %a) { ; despite the name, this IR has no sub: only a signed compare of %a with -1
+; CHECK-LABEL: overflow_sub_negative_const_limit:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    cmp w8, w0, sxtb
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    csel w0, w8, w9, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp slt i8 %a, -1 ; same instruction sequence as @overflow_add_positive_const_limit
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+define i32 @unsafe_sub_underflow(i8 zeroext %a) { ; subtract 6 in i8 (written as add -6), then unsigned-greater-than against -6
+; CHECK-LABEL: unsafe_sub_underflow:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, #6
+; CHECK-NEXT:    and w8, w8, #0xff
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    cmp w8, #250
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    csel w0, w8, w9, hi
+; CHECK-NEXT:    ret
+  %sub = add i8 %a, -6 ; wraps for %a < 6 (no nuw/nsw flags)
+  %cmp = icmp ugt i8 %sub, -6 ; i8 -6 is 250 when read as unsigned
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+define i32 @safe_sub_underflow(i8 zeroext %a) { ; despite the name, this IR has no sub: only an equality test of %a against 0
+; CHECK-LABEL: safe_sub_underflow:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    csel w0, w9, w8, eq
+; CHECK-NEXT:    ret
+  %cmp.not = icmp eq i8 %a, 0
+  %res = select i1 %cmp.not, i32 16, i32 8 ; 16 when %a == 0, otherwise 8 (operands are swapped relative to the sibling tests)
+  ret i32 %res
+}
+
+define i32 @safe_sub_underflow_neg(i8 zeroext %a) { ; subtract 4 in i8 (written as add -4), then unsigned-greater-than against -6
+; CHECK-LABEL: safe_sub_underflow_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, #4
+; CHECK-NEXT:    and w8, w8, #0xff
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    cmp w8, #250
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    csel w0, w8, w9, hi
+; CHECK-NEXT:    ret
+  %sub = add i8 %a, -4 ; wraps for %a < 4 (no nuw/nsw flags)
+  %cmp = icmp ugt i8 %sub, -6 ; i8 -6 is 250 when read as unsigned
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+define i32 @unsafe_sub_underflow_neg(i8 zeroext %a) { ; subtract 4 in i8 (written as add -4), then unsigned-less-than against -3
+; CHECK-LABEL: unsafe_sub_underflow_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, #4
+; CHECK-NEXT:    and w8, w8, #0xff
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    cmp w8, #253
+; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    csel w0, w8, w9, lo
+; CHECK-NEXT:    ret
+  %sub = add i8 %a, -4 ; wraps for %a < 4 (no nuw/nsw flags)
+  %cmp = icmp ult i8 %sub, -3 ; i8 -3 is 253 when read as unsigned
+  %res = select i1 %cmp, i32 8, i32 16
+  ret i32 %res
+}
+
+define i32 @safe_sub_imm_var(i8* nocapture readonly %b) local_unnamed_addr #1 { ; body is just a constant return; NOTE(review): this file's diff contains no `attributes #1` definition
+; CHECK-LABEL: safe_sub_imm_var:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+entry:
+  ret i32 0 ; %b is never read
+}
+
+define i32 @safe_sub_var_imm(i8* nocapture readonly %b) local_unnamed_addr #1 { ; loads a byte, adds 8, and returns the unsigned-greater-than result as 0/1
+; CHECK-LABEL: safe_sub_var_imm:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    add w8, w8, #8
+; CHECK-NEXT:    and w8, w8, #0xff
+; CHECK-NEXT:    cmp w8, #252
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ret
+entry:
+  %0 = load i8, i8* %b, align 1
+  %sub = add nsw i8 %0, 8 ; named %sub but is an add; carries nsw only, not nuw
+  %cmp = icmp ugt i8 %sub, -4 ; i8 -4 is 252 when read as unsigned
+  %conv4 = zext i1 %cmp to i32
+  ret i32 %conv4
+}
+
+define i32 @safe_add_imm_var(i8* nocapture readnone %b) { ; body is just a constant return
+; CHECK-LABEL: safe_add_imm_var:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    ret
+entry:
+  ret i32 1 ; %b is never read (it is marked readnone)
+}
+
+define i32 @safe_add_var_imm(i8* nocapture readnone %b) { ; body is just a constant return, identical to @safe_add_imm_var
+; CHECK-LABEL: safe_add_var_imm:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    ret
+entry:
+  ret i32 1 ; %b is never read (it is marked readnone)
+}
+
+define i8 @convert_add_order(i8 zeroext %arg) { ; two unsigned range tests on (%arg | 1) pick a mask that is ANDed with %arg
+; CHECK-LABEL: convert_add_order:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w8, w0, #0x1
+; CHECK-NEXT:    sub w10, w8, #40
+; CHECK-NEXT:    and w10, w10, #0xff
+; CHECK-NEXT:    mov w9, #1
+; CHECK-NEXT:    cmp w10, #20
+; CHECK-NEXT:    cinc w9, w9, hs
+; CHECK-NEXT:    cmp w8, #50
+; CHECK-NEXT:    csinv w8, w9, wzr, lo
+; CHECK-NEXT:    and w0, w8, w0
+; CHECK-NEXT:    ret
+  %shl = or i8 %arg, 1 ; named %shl but is an or
+  %cmp.0 = icmp ult i8 %shl, 50 ; first test uses the un-offset value
+  %sub = add nsw i8 %shl, -40 ; subtract 40; carries nsw only, not nuw
+  %cmp.1 = icmp ult i8 %sub, 20 ; second test uses the offset value
+  %mask.sel.v = select i1 %cmp.1, i8 1, i8 2
+  %mask.sel = select i1 %cmp.0, i8 %mask.sel.v, i8 -1 ; mask is all-ones when %shl >= 50
+  %res = and i8 %mask.sel, %arg
+  ret i8 %res
+}
+
+define i8 @underflow_if_sub(i32 %arg, i8 zeroext %arg1) { ; truncates an i32 computation to i8, subtracts 11, and compares unsigned against a zeroext argument
+; CHECK-LABEL: underflow_if_sub:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    cset w8, gt
+; CHECK-NEXT:    and w8, w8, w0
+; CHECK-NEXT:    sub w8, w8, #11
+; CHECK-NEXT:    and w9, w8, #0xff
+; CHECK-NEXT:    cmp w9, w1
+; CHECK-NEXT:    mov w9, #100
+; CHECK-NEXT:    csel w0, w8, w9, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i32 %arg, 0
+  %conv = zext i1 %cmp to i32
+  %and = and i32 %conv, %arg ; %conv is 0 or 1, so %and is 0 or 1
+  %trunc = trunc i32 %and to i8
+  %conv1 = add nuw nsw i8 %trunc, -11 ; subtract 11; the add carries both nuw and nsw
+  %cmp.1 = icmp ult i8 %conv1, %arg1 ; unsigned compare against the zeroext argument
+  %res = select i1 %cmp.1, i8 %conv1, i8 100
+  ret i8 %res
+}
+
+define i8 @underflow_if_sub_signext(i32 %arg, i8 signext %arg1) { ; same IR body as @underflow_if_sub, but %arg1 is passed signext
+; CHECK-LABEL: underflow_if_sub_signext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    cset w8, gt
+; CHECK-NEXT:    and w8, w8, w0
+; CHECK-NEXT:    sub w8, w8, #11
+; CHECK-NEXT:    and w9, w8, #0xff
+; CHECK-NEXT:    cmp w9, w1, uxtb
+; CHECK-NEXT:    mov w9, #100
+; CHECK-NEXT:    csel w0, w8, w9, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i32 %arg, 0
+  %conv = zext i1 %cmp to i32
+  %and = and i32 %conv, %arg ; %conv is 0 or 1, so %and is 0 or 1
+  %trunc = trunc i32 %and to i8
+  %conv1 = add nuw nsw i8 %trunc, -11 ; subtract 11; the add carries both nuw and nsw
+  %cmp.1 = icmp ult i8 %conv1, %arg1 ; unsigned compare against an argument that arrives sign-extended
+  %res = select i1 %cmp.1, i8 %conv1, i8 100
+  ret i8 %res
+}

diff  --git a/llvm/test/CodeGen/AArch64/typepromotion-phisret.ll b/llvm/test/CodeGen/AArch64/typepromotion-phisret.ll
new file mode 100644
index 000000000000..7ef74ffc5be8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/typepromotion-phisret.ll
@@ -0,0 +1,295 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+
+define void @phi_feeding_phi_args(i8 %a, i8 %b) { ; a phi of the two i8 arguments feeds a loop phi; the arguments carry no zeroext/signext
+; CHECK-LABEL: phi_feeding_phi_args:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    cmp w8, w1, uxtb
+; CHECK-NEXT:    csel w8, w0, w1, hi
+; CHECK-NEXT:  .LBB0_1: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    and w9, w8, #0xff
+; CHECK-NEXT:    sub w10, w8, #2
+; CHECK-NEXT:    lsl w8, w8, #1
+; CHECK-NEXT:    cmp w9, #254
+; CHECK-NEXT:    csel w8, w10, w8, lo
+; CHECK-NEXT:    mvn w9, w8
+; CHECK-NEXT:    tst w9, #0xff
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+entry:
+  %0 = icmp ugt i8 %a, %b
+  br i1 %0, label %preheader, label %empty
+
+empty:                                            ; preds = %entry
+  br label %preheader
+
+preheader:                                        ; preds = %empty, %entry
+  %1 = phi i8 [ %a, %entry ], [ %b, %empty ] ; %a when %a >u %b, otherwise %b
+  br label %loop
+
+loop:                                             ; preds = %if.end, %preheader
+  %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ]
+  %cmp = icmp ult i8 %val, -2 ; i8 -2 is 254 when read as unsigned
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %loop
+  %inc = sub nuw i8 %val, 2
+  br label %if.end
+
+if.else:                                          ; preds = %loop
+  %inc1 = shl nuw i8 %val, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %inc2 = phi i8 [ %inc, %if.then ], [ %inc1, %if.else ]
+  %cmp1 = icmp eq i8 %inc2, -1 ; the loop exits once the value equals -1 (0xff)
+  br i1 %cmp1, label %exit, label %loop
+
+exit:                                             ; preds = %if.end
+  ret void
+}
+
+define void @phi_feeding_phi_zeroext_args(i8 zeroext %a, i8 zeroext %b) { ; same IR body as @phi_feeding_phi_args, but both arguments are passed zeroext
+; CHECK-LABEL: phi_feeding_phi_zeroext_args:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    csel w8, w0, w1, hi
+; CHECK-NEXT:  .LBB1_1: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    and w9, w8, #0xff
+; CHECK-NEXT:    sub w10, w8, #2
+; CHECK-NEXT:    lsl w8, w8, #1
+; CHECK-NEXT:    cmp w9, #254
+; CHECK-NEXT:    csel w8, w10, w8, lo
+; CHECK-NEXT:    mvn w9, w8
+; CHECK-NEXT:    tst w9, #0xff
+; CHECK-NEXT:    b.ne .LBB1_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+entry:
+  %0 = icmp ugt i8 %a, %b
+  br i1 %0, label %preheader, label %empty
+
+empty:                                            ; preds = %entry
+  br label %preheader
+
+preheader:                                        ; preds = %empty, %entry
+  %1 = phi i8 [ %a, %entry ], [ %b, %empty ] ; %a when %a >u %b, otherwise %b
+  br label %loop
+
+loop:                                             ; preds = %if.end, %preheader
+  %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ]
+  %cmp = icmp ult i8 %val, -2 ; i8 -2 is 254 when read as unsigned
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %loop
+  %inc = sub nuw i8 %val, 2
+  br label %if.end
+
+if.else:                                          ; preds = %loop
+  %inc1 = shl nuw i8 %val, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %inc2 = phi i8 [ %inc, %if.then ], [ %inc1, %if.else ]
+  %cmp1 = icmp eq i8 %inc2, -1 ; the loop exits once the value equals -1 (0xff)
+  br i1 %cmp1, label %exit, label %loop
+
+exit:                                             ; preds = %if.end
+  ret void
+}
+
+define void @phi_i16() { ; loop over an i16 phi that starts at 0 and is stepped by 2 or 1 per iteration
+; CHECK-LABEL: phi_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    mov w9, #1
+; CHECK-NEXT:  .LBB2_1: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    and w10, w8, #0xffff
+; CHECK-NEXT:    cmp w10, #128
+; CHECK-NEXT:    cinc w10, w9, lo
+; CHECK-NEXT:    add w8, w8, w10
+; CHECK-NEXT:    and w10, w8, #0xffff
+; CHECK-NEXT:    cmp w10, #253
+; CHECK-NEXT:    b.lo .LBB2_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:                                             ; preds = %if.end, %entry
+  %val = phi i16 [ 0, %entry ], [ %inc2, %if.end ]
+  %cmp = icmp ult i16 %val, 128 ; step by 2 while below 128, otherwise by 1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %loop
+  %inc = add nuw i16 %val, 2
+  br label %if.end
+
+if.else:                                          ; preds = %loop
+  %inc1 = add nuw i16 %val, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %inc2 = phi i16 [ %inc, %if.then ], [ %inc1, %if.else ]
+  %cmp1 = icmp ult i16 %inc2, 253 ; keep looping while the stepped value is below 253
+  br i1 %cmp1, label %loop, label %exit
+
+exit:                                             ; preds = %if.end
+  ret void
+}
+
+define i8 @ret_i8() { ; loop over an i8 phi starting at 0; the stepped value is also the return value
+; CHECK-LABEL: ret_i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:  .LBB3_1: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sxtb w9, w0
+; CHECK-NEXT:    cmp w9, #0
+; CHECK-NEXT:    cinc w9, w8, ge
+; CHECK-NEXT:    add w0, w0, w9
+; CHECK-NEXT:    and w9, w0, #0xff
+; CHECK-NEXT:    cmp w9, #252
+; CHECK-NEXT:    b.hi .LBB3_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:                                             ; preds = %if.end, %entry
+  %val = phi i8 [ 0, %entry ], [ %inc2, %if.end ]
+  %cmp = icmp ult i8 %val, -128 ; i8 -128 is 128 when read as unsigned
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %loop
+  %inc = add nuw i8 %val, 2
+  br label %if.end
+
+if.else:                                          ; preds = %loop
+  %inc1 = add nuw i8 %val, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %inc2 = phi i8 [ %inc, %if.then ], [ %inc1, %if.else ]
+  %cmp1 = icmp ult i8 %inc2, -3 ; i8 -3 is 253 unsigned; here a true result leaves the loop
+  br i1 %cmp1, label %exit, label %loop
+
+exit:                                             ; preds = %if.end
+  ret i8 %inc2
+}
+
+define i16 @phi_multiple_undefs(i16 zeroext %arg) { ; same loop shape as @phi_i16, but two phis have undef incoming values
+; CHECK-LABEL: phi_multiple_undefs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    // implicit-def: $w9
+; CHECK-NEXT:  .LBB4_1: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    and w10, w9, #0xffff
+; CHECK-NEXT:    cmp w10, #128
+; CHECK-NEXT:    cinc w10, w8, lo
+; CHECK-NEXT:    add w9, w9, w10
+; CHECK-NEXT:    and w10, w9, #0xffff
+; CHECK-NEXT:    cmp w10, #253
+; CHECK-NEXT:    b.lo .LBB4_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:                                             ; preds = %if.end, %entry
+  %val = phi i16 [ undef, %entry ], [ %inc2, %if.end ] ; the loop-carried value starts as undef
+  %cmp = icmp ult i16 %val, 128
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %loop
+  %inc = add nuw i16 %val, 2
+  br label %if.end
+
+if.else:                                          ; preds = %loop
+  %inc1 = add nuw i16 %val, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %inc2 = phi i16 [ %inc, %if.then ], [ %inc1, %if.else ]
+  %unrelated = phi i16 [ undef, %if.then ], [ %arg, %if.else ] ; second phi with an undef input; it does not feed the loop arithmetic
+  %cmp1 = icmp ult i16 %inc2, 253
+  br i1 %cmp1, label %loop, label %exit
+
+exit:                                             ; preds = %if.end
+  ret i16 %unrelated
+}
+
+define i16 @promote_arg_return(i16 zeroext %arg1, i16 zeroext %arg2, i8* %res) { ; stores a compare result through %res and returns %arg1 unchanged
+; CHECK-LABEL: promote_arg_return:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w0, w0, lsl #1
+; CHECK-NEXT:    add w8, w8, #45
+; CHECK-NEXT:    cmp w1, w8, uxth
+; CHECK-NEXT:    cset w8, hi
+; CHECK-NEXT:    strb w8, [x2]
+; CHECK-NEXT:    ret
+  %add = add nuw i16 %arg1, 15
+  %mul = mul nuw nsw i16 %add, 3 ; (%arg1 + 15) * 3
+  %cmp = icmp ult i16 %mul, %arg2
+  %conv = zext i1 %cmp to i8
+  store i8 %conv, i8* %res, align 1 ; the only use of the compare result
+  ret i16 %arg1 ; the returned value is the same argument that feeds the arithmetic above
+}
+
+define i16 @signext_bitcast_phi_select(i16 signext %start, i16* %in) { ; loop mixing signed compares, an equality compare and an indexed load on one i16 phi
+; CHECK-LABEL: signext_bitcast_phi_select:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    cmp w9, w0, sxth
+; CHECK-NEXT:    b.lt .LBB6_3
+; CHECK-NEXT:  .LBB6_1: // %if.then
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sxth x8, w0
+; CHECK-NEXT:    ldrh w8, [x1, x8, lsl #1]
+; CHECK-NEXT:    cmp w8, w0, uxth
+; CHECK-NEXT:    b.eq .LBB6_4
+; CHECK-NEXT:  // %bb.2: // %if.else
+; CHECK-NEXT:    // in Loop: Header=BB6_1 Depth=1
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    and w8, w8, #0x8000
+; CHECK-NEXT:    add w0, w0, w8, lsr #15
+; CHECK-NEXT:    cmp w9, w0, sxth
+; CHECK-NEXT:    b.ge .LBB6_1
+; CHECK-NEXT:  .LBB6_3:
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:  .LBB6_4: // %exit
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %const = bitcast i16 -1 to i16 ; same-type bitcast of the constant -1
+  br label %for.body
+
+for.body:                                         ; preds = %if.else, %entry
+  %idx = phi i16 [ %select, %if.else ], [ %start, %entry ]
+  %cmp.i = icmp sgt i16 %idx, %const ; signed test: exit with result 0 once %idx > -1
+  br i1 %cmp.i, label %exit, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %idx.next = getelementptr i16, i16* %in, i16 %idx ; %idx is used as an i16 element index into %in
+  %ld = load i16, i16* %idx.next, align 2
+  %cmp1.i = icmp eq i16 %ld, %idx ; exit with the loaded value when it equals the index
+  br i1 %cmp1.i, label %exit, label %if.else
+
+if.else:                                          ; preds = %if.then
+  %lobit = lshr i16 %idx, 15 ; extracts the sign bit of %idx
+  %lobit.not = xor i16 %lobit, 1
+  %select = add nuw i16 %lobit.not, %idx ; adds 1 when the sign bit is clear, 0 when it is set
+  br label %for.body
+
+exit:                                             ; preds = %if.then, %for.body
+  %res = phi i16 [ %ld, %if.then ], [ 0, %for.body ]
+  ret i16 %res
+}

diff  --git a/llvm/test/CodeGen/AArch64/typepromotion-signed.ll b/llvm/test/CodeGen/AArch64/typepromotion-signed.ll
new file mode 100644
index 000000000000..e59f409c0187
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/typepromotion-signed.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+
+define i16 @test_signed_load(i16* nocapture readonly %ptr) { ; returns 1 when zext and sext of the loaded i16 are equal
+; CHECK-LABEL: test_signed_load:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrsh w8, [x0]
+; CHECK-NEXT:    tst w8, #0xffff0000
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %load = load i16, i16* %ptr, align 2 ; single load feeding both extensions below
+  %conv0 = zext i16 %load to i32 ; zero-extended copy
+  %conv1 = sext i16 %load to i32 ; sign-extended copy
+  %cmp = icmp eq i32 %conv0, %conv1 ; the two copies match only when bit 15 of %load is clear
+  %conv2 = zext i1 %cmp to i16 ; widen the i1 result to the i16 return type
+  ret i16 %conv2
+}
+
+define i16 @test_ashr(i16 zeroext %arg) local_unnamed_addr #1 { ; NOTE(review): body has no ashr despite the name, and no `attributes #1` group appears in this hunk - confirm intent
+; CHECK-LABEL: test_ashr:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, #2
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ult i16 %arg, 2 ; unsigned compare: true only for %arg equal to 0 or 1
+  %conv = zext i1 %cmp to i16 ; widen the i1 result to the i16 return type
+  ret i16 %conv
+}
+
+define i16 @test_sdiv(i16 zeroext %arg) local_unnamed_addr #1 { ; NOTE(review): body has no sdiv despite the name, and no `attributes #1` group appears in this hunk - confirm intent
+; CHECK-LABEL: test_sdiv:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w0, #1
+; CHECK-NEXT:    and w8, w8, #0xffff
+; CHECK-NEXT:    cmp w8, #2
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ret
+  %arg.off = add i16 %arg, 1 ; no nuw/nsw flags, so the sum wraps at 16 bits
+  %1 = icmp ugt i16 %arg.off, 2 ; unsigned compare on the wrapped 16-bit sum
+  %conv = zext i1 %1 to i16 ; widen the i1 result to the i16 return type
+  ret i16 %conv
+}
+
+define i16 @test_srem(i16 zeroext %arg) local_unnamed_addr #1 { ; NOTE(review): body has no srem despite the name, and no `attributes #1` group appears in this hunk - confirm intent
+; CHECK-LABEL: test_srem:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tst w0, #0x3
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %1 = and i16 %arg, 3 ; keep only the two low bits of %arg
+  %cmp = icmp ne i16 %1, 0 ; true when either of those bits is set
+  %conv = zext i1 %cmp to i16 ; widen the i1 result to the i16 return type
+  ret i16 %conv
+}
+
+define i32 @test_signext_b(i8* nocapture readonly %ptr, i8 signext %arg) { ; selects 42 or 20894 on the sign of (loaded i8 + %arg)
+; CHECK-LABEL: test_signext_b:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    mov w9, #20894
+; CHECK-NEXT:    add w8, w8, w1
+; CHECK-NEXT:    sxtb w8, w8
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    csel w0, w8, w9, ge
+; CHECK-NEXT:    ret
+entry:
+  %0 = load i8, i8* %ptr, align 1
+  %1 = add nuw nsw i8 %0, %arg ; nuw+nsw: the sum is poison if it overflows as unsigned or as signed
+  %cmp = icmp sgt i8 %1, -1 ; signed compare: true when the i8 sum is non-negative
+  %res = select i1 %cmp, i32 42, i32 20894 ; 42 for a non-negative sum, 20894 otherwise
+  ret i32 %res
+}
+
+define i32 @test_signext_b_ult_slt(i8* nocapture readonly %ptr, i8 signext %arg) { ; NOTE(review): name says ult/slt but the compares below are ne and eq - confirm intent
+; CHECK-LABEL: test_signext_b_ult_slt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    mov w9, #57
+; CHECK-NEXT:    add w10, w8, w1
+; CHECK-NEXT:    and w10, w10, #0xff
+; CHECK-NEXT:    cmp w10, #127
+; CHECK-NEXT:    ccmp w8, #0, #0, ne
+; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    csel w0, w8, w9, eq
+; CHECK-NEXT:    ret
+entry:
+  %0 = load i8, i8* %ptr, align 1
+  %1 = add nuw nsw i8 %0, %arg ; nuw+nsw: the sum is poison if it overflows as unsigned or as signed
+  %cmp = icmp ne i8 %1, 127 ; true when the sum differs from 127
+  %cmp.1 = icmp eq i8 %0, 0 ; true when the loaded byte itself is zero
+  %or = and i1 %cmp.1, %cmp ; despite the name %or, this is the AND of the two compares
+  %res = select i1 %or, i32 42, i32 57 ; 42 when both compares hold, 57 otherwise
+  ret i32 %res
+}
+
+define i32 @test_signext_h(i16* nocapture readonly %ptr, i16 signext %arg) { ; selects 42 or 20894 on the sign of (loaded i16 + %arg)
+; CHECK-LABEL: test_signext_h:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    mov w9, #20894
+; CHECK-NEXT:    add w8, w8, w1
+; CHECK-NEXT:    sxth w8, w8
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    csel w0, w8, w9, ge
+; CHECK-NEXT:    ret
+entry:
+  %0 = load i16, i16* %ptr, align 1 ; i16 load declared with only byte (align 1) alignment
+  %1 = add nuw nsw i16 %0, %arg ; nuw+nsw: the sum is poison if it overflows as unsigned or as signed
+  %cmp = icmp sgt i16 %1, -1 ; signed compare: true when the i16 sum is non-negative
+  %res = select i1 %cmp, i32 42, i32 20894 ; 42 for a non-negative sum, 20894 otherwise
+  ret i32 %res
+}


        


More information about the llvm-commits mailing list