[clang] [clang-tools-extra] [llvm] Add out-of-line-atomics support to GlobalISel (PR #74588)

Thomas Preud'homme via cfe-commits cfe-commits at lists.llvm.org
Mon Dec 18 03:31:21 PST 2023


https://github.com/RoboTux updated https://github.com/llvm/llvm-project/pull/74588

From 57e9965465c921b137051b46c1d3e5e245f9cd38 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme at arm.com>
Date: Fri, 1 Dec 2023 12:01:52 +0000
Subject: [PATCH 1/8] Add out-of-line-atomics support to GlobalISel

This patch implements the GlobalISel counterpart to
4d7df43ffdb460dddb2877a886f75f45c3fee188.
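
As a rough sketch of what this enables (illustrative only; the function name
below is made up, and helper names follow the __aarch64_<op><size>_<order>
scheme exercised by the tests in this patch): with outline-atomics enabled on
a target without LSE, GlobalISel now legalizes an atomic RMW into a call to
the matching compiler-rt/libgcc helper instead of expanding an LL/SC loop
inline.

  ; LLVM IR input
  define i32 @fetch_add(ptr %p, i32 %v) {
    %old = atomicrmw add ptr %p, i32 %v monotonic
    ret i32 %old
  }
  ; AArch64 output (sketch): the value is moved to w0 and the pointer to x1,
  ; then the runtime helper is called; the old value comes back in w0.
  ;   bl __aarch64_ldadd4_relax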
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  183 ++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   29 +-
 .../aarch64-atomic-load-outline_atomics.ll    |   48 +-
 .../aarch64-atomic-store-outline_atomics.ll   |   48 +-
 .../aarch64-atomicrmw-outline_atomics.ll      | 2380 +++++++----------
 .../aarch64-cmpxchg-outline_atomics.ll        | 1683 +++---------
 .../AArch64/GlobalISel/arm64-atomic-128.ll    |  239 ++
 .../AArch64/GlobalISel/arm64-atomic.ll        | 1367 ++++++++++
 .../GlobalISel/legalizer-info-validation.mir  |    2 -
 9 files changed, 3157 insertions(+), 2822 deletions(-)

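A quick note on how the helper is chosen (this summarizes the
getOutlineAtomicLibcall code in the diff, it is not additional code): the
access size selects a row (1/2/4/8/16 bytes -> ModeN 0-4) and the merged
ordering selects a column (monotonic/acquire/release/{acq_rel,seq_cst} ->
ModelN 0-3) of a per-opcode table of OUTLINE_ATOMIC_* libcalls. There is no
"ldand" helper; the runtime only provides LDCLR (clear bits), so
createAtomicLibcall first inverts the AND operand with an XOR against -1,
which is why the -O0 checks below switch from mvn to an eor against an
all-ones register. For example (function name made up for illustration,
register shuffling omitted):

  ; LLVM IR input
  define i32 @fetch_and(ptr %p, i32 %v) {
    %old = atomicrmw and ptr %p, i32 %v acquire
    ret i32 %old
  }
  ; AArch64 output (sketch; exact register assignment varies):
  ;   mvn w0, w1                  ; operand inverted before the call
  ;   bl  __aarch64_ldclr4_acq    ; old value returned in w0
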
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 045fc78218daef..186937e597c5bc 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -765,6 +765,166 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
   return LegalizerHelper::Legalized;
 }
 
+static RTLIB::Libcall
+getOutlineAtomicLibcall(unsigned Opc, AtomicOrdering Order, uint64_t MemSize) {
+  unsigned ModeN, ModelN;
+  switch (MemSize) {
+  case 1:
+    ModeN = 0;
+    break;
+  case 2:
+    ModeN = 1;
+    break;
+  case 4:
+    ModeN = 2;
+    break;
+  case 8:
+    ModeN = 3;
+    break;
+  case 16:
+    ModeN = 4;
+    break;
+  default:
+    return RTLIB::UNKNOWN_LIBCALL;
+  }
+
+  switch (Order) {
+  case AtomicOrdering::Monotonic:
+    ModelN = 0;
+    break;
+  case AtomicOrdering::Acquire:
+    ModelN = 1;
+    break;
+  case AtomicOrdering::Release:
+    ModelN = 2;
+    break;
+  case AtomicOrdering::AcquireRelease:
+  case AtomicOrdering::SequentiallyConsistent:
+    ModelN = 3;
+    break;
+  default:
+    return RTLIB::UNKNOWN_LIBCALL;
+  }
+
+#define LCALLS(A, B)                                                           \
+  { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
+#define LCALL5(A)                                                              \
+  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
+  switch (Opc) {
+  case TargetOpcode::G_ATOMIC_CMPXCHG:
+  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_XCHG: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_ADD: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_AND: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_OR: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_XOR: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
+    return LC[ModeN][ModelN];
+  }
+  default:
+    return RTLIB::UNKNOWN_LIBCALL;
+  }
+#undef LCALLS
+#undef LCALL5
+}
+
+static LegalizerHelper::LegalizeResult
+createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
+  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
+  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+
+  // Add all the args, except for the last which is an imm denoting 'tail'.
+  // For the cmpxchg opcodes, operand 0 is the returned old value (operand 1
+  // is the success flag for G_ATOMIC_CMPXCHG_WITH_SUCCESS); the remaining
+  // register operands are the pointer, the expected value and the new value.
+  Type *RetTy;
+  SmallVector<Register> RetRegs;
+  SmallVector<CallLowering::ArgInfo, 3> Args;
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  case TargetOpcode::G_ATOMIC_CMPXCHG:
+  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
+    Register Success;
+    LLT SuccessLLT;
+    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
+        MI.getFirst4RegLLTs();
+    RetRegs.push_back(Ret);
+    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
+    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
+      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
+               NewLLT) = MI.getFirst5RegLLTs();
+      RetRegs.push_back(Success);
+      RetTy = StructType::get(
+          Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
+    }
+    Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
+    Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
+    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
+    break;
+  }
+  case TargetOpcode::G_ATOMICRMW_XCHG:
+  case TargetOpcode::G_ATOMICRMW_ADD:
+  case TargetOpcode::G_ATOMICRMW_AND:
+  case TargetOpcode::G_ATOMICRMW_OR:
+  case TargetOpcode::G_ATOMICRMW_XOR: {
+    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
+    RetRegs.push_back(Ret);
+    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
+    if (Opc == TargetOpcode::G_ATOMICRMW_AND) {
+      Register Tmp = MRI.createGenericVirtualRegister(ValLLT);
+      MIRBuilder.buildXor(Tmp, MIRBuilder.buildConstant(ValLLT, -1), Val);
+      Val = Tmp;
+    }
+    Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
+    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
+    break;
+  }
+  default:
+    llvm_unreachable("unsupported opcode");
+  }
+
+  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
+  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+  auto &AtomicMI = cast<GMemOperation>(MI);
+  auto Ordering = AtomicMI.getMMO().getMergedOrdering();
+  uint64_t MemSize = AtomicMI.getMemSize();
+  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(Opc, Ordering, MemSize);
+  const char *Name = TLI.getLibcallName(RTLibcall);
+
+  // Unsupported libcall on the target.
+  if (!Name) {
+    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
+                      << MIRBuilder.getTII().getName(Opc) << "\n");
+    return LegalizerHelper::UnableToLegalize;
+  }
+
+  CallLowering::CallLoweringInfo Info;
+  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
+  Info.Callee = MachineOperand::CreateES(Name);
+  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);
+
+  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
+  if (!CLI.lowerCall(MIRBuilder, Info))
+    return LegalizerHelper::UnableToLegalize;
+
+  return LegalizerHelper::Legalized;
+}
+
 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                        Type *FromType) {
   auto ToMVT = MVT::getVT(ToType);
@@ -1020,6 +1180,18 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
       return Status;
     break;
   }
+  case TargetOpcode::G_ATOMICRMW_XCHG:
+  case TargetOpcode::G_ATOMICRMW_ADD:
+  case TargetOpcode::G_ATOMICRMW_AND:
+  case TargetOpcode::G_ATOMICRMW_OR:
+  case TargetOpcode::G_ATOMICRMW_XOR:
+  case TargetOpcode::G_ATOMIC_CMPXCHG:
+  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
+    auto Status = createAtomicLibcall(MIRBuilder, MI);
+    if (Status != Legalized)
+      return Status;
+    break;
+  }
   case TargetOpcode::G_BZERO:
   case TargetOpcode::G_MEMCPY:
   case TargetOpcode::G_MEMMOVE:
@@ -3793,6 +3965,17 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     return lowerTRUNC(MI);
   GISEL_VECREDUCE_CASES_NONSEQ
     return lowerVectorReduction(MI);
+  case G_ATOMICRMW_SUB: {
+    auto Val = MI.getOperand(2).getReg();
+    LLT ValLLT = MRI.getType(Val);
+    Register Tmp = MRI.createGenericVirtualRegister(ValLLT);
+    MIRBuilder.buildSub(Tmp, MIRBuilder.buildConstant(ValLLT, 0), Val);
+    auto [Ret, Mem] = MI.getFirst2Regs();
+    auto &MMO = cast<GMemOperation>(MI).getMMO();
+    MIRBuilder.buildAtomicRMWAdd(Ret, Mem, Tmp, MMO);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   }
 }
 
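There is also no out-of-line "sub" helper, so the G_ATOMICRMW_SUB case added
above negates the operand and reuses the add path; the AArch64 legalizer rules
in the next file request this via lowerIf when outlining is in effect. Roughly
(function name made up for illustration; exact -O0 register allocation
differs):

  ; LLVM IR input
  define i32 @fetch_sub(ptr %p, i32 %v) {
    %old = atomicrmw sub ptr %p, i32 %v monotonic
    ret i32 %old
  }
  ; AArch64 output (sketch; register shuffling omitted):
  ;   neg w0, w1                  ; 0 - %v
  ;   bl  __aarch64_ldadd4_relax
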
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 21a412e9360dce..7fce3e501db57c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -758,16 +758,39 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
           all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
 
   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
+      .libcallIf([&ST](const LegalityQuery &Query) {
+        return ST.outlineAtomics() && !ST.hasLSE();
+      })
       .customIf([](const LegalityQuery &Query) {
         return Query.Types[0].getSizeInBits() == 128;
       })
       .clampScalar(0, s32, s64)
       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
 
+  getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
+                               G_ATOMICRMW_AND, G_ATOMICRMW_OR,
+                               G_ATOMICRMW_XOR})
+      .libcallIf([&ST](const LegalityQuery &Query) {
+        return ST.outlineAtomics() && !ST.hasLSE();
+      })
+      .clampScalar(0, s32, s64)
+      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
+
+  getActionDefinitionsBuilder(G_ATOMICRMW_SUB)
+      .lowerIf([&ST](const LegalityQuery &Query) {
+        return ST.outlineAtomics() && !ST.hasLSE();
+      })
+      .clampScalar(0, s32, s64)
+      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
+
+  // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
+  // Don't outline them until:
+  // (1) high-level <atomic> support is approved:
+  //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
+  // (2) low-level libgcc and compiler-rt support is implemented by
+  //   min/max outline atomics helpers.
   getActionDefinitionsBuilder(
-      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
-       G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
-       G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
+      {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
       .clampScalar(0, s32, s64)
       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
 
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
index fb4bef33d9b4ff..fccafb29addbc3 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
@@ -229,11 +229,7 @@ define dso_local i64 @load_atomic_i64_aligned_seq_cst_const(ptr readonly %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_unordered:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_unordered:
 ; -O1:    ldxp x0, x1, [x8]
@@ -244,11 +240,7 @@ define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_unordered_const:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_unordered_const:
 ; -O1:    ldxp x0, x1, [x8]
@@ -259,11 +251,7 @@ define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %pt
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
@@ -274,11 +262,7 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_monotonic_const:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_monotonic_const:
 ; -O1:    ldxp x0, x1, [x8]
@@ -289,11 +273,7 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %pt
 
 define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_acquire:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
@@ -304,11 +284,7 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_acquire_const:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_acquire_const:
 ; -O1:    ldaxp x0, x1, [x8]
@@ -319,11 +295,7 @@ define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr)
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x10, x10, [x9]
-; -O0:    stlxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq_rel
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
@@ -334,11 +306,7 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_seq_cst_const:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x10, x10, [x9]
-; -O0:    stlxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq_rel
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const:
 ; -O1:    ldaxp x0, x1, [x8]
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll
index 3d204b734d4a03..e594561010464b 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll
@@ -117,14 +117,10 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_unordered:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_unordered:
@@ -136,14 +132,10 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_monotonic:
@@ -155,14 +147,10 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_release:
@@ -174,14 +162,10 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr)
 
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
index c660c139e35d44..e9b096e8c6c44b 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
@@ -145,14 +145,10 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
@@ -164,14 +160,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %val
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire:
@@ -183,14 +175,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release:
@@ -202,14 +190,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
@@ -221,14 +205,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
@@ -555,16 +535,12 @@ define dso_local i64 @atomicrmw_add_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic:
-; -O0:    adds x14, x8, x10
+; -O0:    adds x2, x8, x10
 ; -O0:    subs w10, w10, #1
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic:
@@ -577,16 +553,12 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %valu
 
 define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire:
-; -O0:    adds x14, x8, x10
+; -O0:    adds x2, x8, x10
 ; -O0:    subs w10, w10, #1
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire:
@@ -599,16 +571,12 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_add_i128_aligned_release:
-; -O0:    adds x14, x8, x10
+; -O0:    adds x2, x8, x10
 ; -O0:    subs w10, w10, #1
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_add_i128_aligned_release:
@@ -621,16 +589,12 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel:
-; -O0:    adds x14, x8, x10
+; -O0:    adds x2, x8, x10
 ; -O0:    subs w10, w10, #1
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel:
@@ -643,16 +607,12 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst:
-; -O0:    adds x14, x8, x10
+; -O0:    adds x2, x8, x10
 ; -O0:    subs w10, w10, #1
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst:
@@ -1170,15 +1130,11 @@ define dso_local i64 @atomicrmw_sub_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic:
-; -O0:    subs x14, x8, x10
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    subs x2, x8, x10
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic:
@@ -1191,15 +1147,11 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %valu
 
 define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire:
-; -O0:    subs x14, x8, x10
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    subs x2, x8, x10
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire:
@@ -1212,15 +1164,11 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_sub_i128_aligned_release:
-; -O0:    subs x14, x8, x10
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    subs x2, x8, x10
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_aligned_release:
@@ -1233,15 +1181,11 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel:
-; -O0:    subs x14, x8, x10
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    subs x2, x8, x10
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel:
@@ -1254,15 +1198,11 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst:
-; -O0:    subs x14, x8, x10
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    subs x2, x8, x10
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst:
@@ -1575,7 +1515,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 
 define dso_local i8 @atomicrmw_and_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_and_i8_aligned_monotonic:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr1_relax
 ;
 ; -O1-LABEL: atomicrmw_and_i8_aligned_monotonic:
@@ -1587,7 +1527,7 @@ define dso_local i8 @atomicrmw_and_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_and_i8_aligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_and_i8_aligned_acquire:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr1_acq
 ;
 ; -O1-LABEL: atomicrmw_and_i8_aligned_acquire:
@@ -1599,7 +1539,7 @@ define dso_local i8 @atomicrmw_and_i8_aligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_and_i8_aligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_and_i8_aligned_release:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr1_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i8_aligned_release:
@@ -1611,7 +1551,7 @@ define dso_local i8 @atomicrmw_and_i8_aligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_and_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_and_i8_aligned_acq_rel:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr1_acq_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i8_aligned_acq_rel:
@@ -1623,7 +1563,7 @@ define dso_local i8 @atomicrmw_and_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_and_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_and_i8_aligned_seq_cst:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr1_acq_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i8_aligned_seq_cst:
@@ -1635,7 +1575,7 @@ define dso_local i8 @atomicrmw_and_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 
 define dso_local i16 @atomicrmw_and_i16_aligned_monotonic(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_and_i16_aligned_monotonic:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr2_relax
 ;
 ; -O1-LABEL: atomicrmw_and_i16_aligned_monotonic:
@@ -1647,7 +1587,7 @@ define dso_local i16 @atomicrmw_and_i16_aligned_monotonic(ptr %ptr, i16 %value)
 
 define dso_local i16 @atomicrmw_and_i16_aligned_acquire(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_and_i16_aligned_acquire:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr2_acq
 ;
 ; -O1-LABEL: atomicrmw_and_i16_aligned_acquire:
@@ -1659,7 +1599,7 @@ define dso_local i16 @atomicrmw_and_i16_aligned_acquire(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_and_i16_aligned_release(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_and_i16_aligned_release:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr2_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i16_aligned_release:
@@ -1671,7 +1611,7 @@ define dso_local i16 @atomicrmw_and_i16_aligned_release(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_and_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_and_i16_aligned_acq_rel:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr2_acq_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i16_aligned_acq_rel:
@@ -1683,7 +1623,7 @@ define dso_local i16 @atomicrmw_and_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_and_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_and_i16_aligned_seq_cst:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr2_acq_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i16_aligned_seq_cst:
@@ -1695,7 +1635,7 @@ define dso_local i16 @atomicrmw_and_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 
 define dso_local i32 @atomicrmw_and_i32_aligned_monotonic(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_and_i32_aligned_monotonic:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr4_relax
 ;
 ; -O1-LABEL: atomicrmw_and_i32_aligned_monotonic:
@@ -1707,7 +1647,7 @@ define dso_local i32 @atomicrmw_and_i32_aligned_monotonic(ptr %ptr, i32 %value)
 
 define dso_local i32 @atomicrmw_and_i32_aligned_acquire(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_and_i32_aligned_acquire:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr4_acq
 ;
 ; -O1-LABEL: atomicrmw_and_i32_aligned_acquire:
@@ -1719,7 +1659,7 @@ define dso_local i32 @atomicrmw_and_i32_aligned_acquire(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_and_i32_aligned_release(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_and_i32_aligned_release:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr4_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i32_aligned_release:
@@ -1731,7 +1671,7 @@ define dso_local i32 @atomicrmw_and_i32_aligned_release(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_and_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_and_i32_aligned_acq_rel:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr4_acq_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i32_aligned_acq_rel:
@@ -1743,7 +1683,7 @@ define dso_local i32 @atomicrmw_and_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_and_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_and_i32_aligned_seq_cst:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr4_acq_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i32_aligned_seq_cst:
@@ -1755,7 +1695,7 @@ define dso_local i32 @atomicrmw_and_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 
 define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_and_i64_aligned_monotonic:
-; -O0:    mvn x0, x8
+; -O0:    eor x0, x8, x9
 ; -O0:    bl __aarch64_ldclr8_relax
 ;
 ; -O1-LABEL: atomicrmw_and_i64_aligned_monotonic:
@@ -1767,7 +1707,7 @@ define dso_local i64 @atomicrmw_and_i64_aligned_monotonic(ptr %ptr, i64 %value)
 
 define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_and_i64_aligned_acquire:
-; -O0:    mvn x0, x8
+; -O0:    eor x0, x8, x9
 ; -O0:    bl __aarch64_ldclr8_acq
 ;
 ; -O1-LABEL: atomicrmw_and_i64_aligned_acquire:
@@ -1779,7 +1719,7 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acquire(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_and_i64_aligned_release:
-; -O0:    mvn x0, x8
+; -O0:    eor x0, x8, x9
 ; -O0:    bl __aarch64_ldclr8_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i64_aligned_release:
@@ -1791,7 +1731,7 @@ define dso_local i64 @atomicrmw_and_i64_aligned_release(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_and_i64_aligned_acq_rel:
-; -O0:    mvn x0, x8
+; -O0:    eor x0, x8, x9
 ; -O0:    bl __aarch64_ldclr8_acq_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i64_aligned_acq_rel:
@@ -1803,7 +1743,7 @@ define dso_local i64 @atomicrmw_and_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_and_i64_aligned_seq_cst:
-; -O0:    mvn x0, x8
+; -O0:    eor x0, x8, x9
 ; -O0:    bl __aarch64_ldclr8_acq_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i64_aligned_seq_cst:
@@ -1815,16 +1755,12 @@ define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic:
-; -O0:    and x14, x8, x10
-; -O0:    and x15, x8, x9
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    and x2, x8, x10
+; -O0:    and x3, x8, x9
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
@@ -1838,16 +1774,12 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu
 
 define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_acquire:
-; -O0:    and x14, x8, x10
-; -O0:    and x15, x8, x9
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    and x2, x8, x10
+; -O0:    and x3, x8, x9
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
@@ -1861,16 +1793,12 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_release:
-; -O0:    and x14, x8, x10
-; -O0:    and x15, x8, x9
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    and x2, x8, x10
+; -O0:    and x3, x8, x9
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
@@ -1884,16 +1812,12 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel:
-; -O0:    and x14, x8, x10
-; -O0:    and x15, x8, x9
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    and x2, x8, x10
+; -O0:    and x3, x8, x9
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
@@ -1907,16 +1831,12 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst:
-; -O0:    and x14, x8, x10
-; -O0:    and x15, x8, x9
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    and x2, x8, x10
+; -O0:    and x3, x8, x9
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
@@ -1930,7 +1850,7 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 
 define dso_local i8 @atomicrmw_and_i8_unaligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_and_i8_unaligned_monotonic:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr1_relax
 ;
 ; -O1-LABEL: atomicrmw_and_i8_unaligned_monotonic:
@@ -1942,7 +1862,7 @@ define dso_local i8 @atomicrmw_and_i8_unaligned_monotonic(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_and_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_and_i8_unaligned_acquire:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr1_acq
 ;
 ; -O1-LABEL: atomicrmw_and_i8_unaligned_acquire:
@@ -1954,7 +1874,7 @@ define dso_local i8 @atomicrmw_and_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_and_i8_unaligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_and_i8_unaligned_release:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr1_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i8_unaligned_release:
@@ -1966,7 +1886,7 @@ define dso_local i8 @atomicrmw_and_i8_unaligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_and_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_and_i8_unaligned_acq_rel:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr1_acq_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i8_unaligned_acq_rel:
@@ -1978,7 +1898,7 @@ define dso_local i8 @atomicrmw_and_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_and_i8_unaligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_and_i8_unaligned_seq_cst:
-; -O0:    mvn w0, w8
+; -O0:    eor w0, w8, w9
 ; -O0:    bl __aarch64_ldclr1_acq_rel
 ;
 ; -O1-LABEL: atomicrmw_and_i8_unaligned_seq_cst:
@@ -2245,13 +2165,11 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 
 define dso_local i8 @atomicrmw_nand_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_nand_i8_aligned_monotonic:
-; -O0:    and w8, w10, w8
-; -O0:    mvn w12, w8
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas1_relax
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_nand_i8_aligned_monotonic:
 ; -O1:    ldxrb w8, [x0]
@@ -2264,13 +2182,11 @@ define dso_local i8 @atomicrmw_nand_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_nand_i8_aligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_nand_i8_aligned_acquire:
-; -O0:    and w8, w10, w8
-; -O0:    mvn w12, w8
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas1_acq
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_nand_i8_aligned_acquire:
 ; -O1:    ldaxrb w8, [x0]
@@ -2283,13 +2199,11 @@ define dso_local i8 @atomicrmw_nand_i8_aligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_nand_i8_aligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_nand_i8_aligned_release:
-; -O0:    and w8, w10, w8
-; -O0:    mvn w12, w8
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas1_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_nand_i8_aligned_release:
 ; -O1:    ldxrb w8, [x0]
@@ -2302,13 +2216,11 @@ define dso_local i8 @atomicrmw_nand_i8_aligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_nand_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_nand_i8_aligned_acq_rel:
-; -O0:    and w8, w10, w8
-; -O0:    mvn w12, w8
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_nand_i8_aligned_acq_rel:
 ; -O1:    ldaxrb w8, [x0]
@@ -2321,13 +2233,11 @@ define dso_local i8 @atomicrmw_nand_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_nand_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_nand_i8_aligned_seq_cst:
-; -O0:    and w8, w10, w8
-; -O0:    mvn w12, w8
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_nand_i8_aligned_seq_cst:
 ; -O1:    ldaxrb w8, [x0]
@@ -2340,12 +2250,10 @@ define dso_local i8 @atomicrmw_nand_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 
 define dso_local i16 @atomicrmw_nand_i16_aligned_monotonic(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_nand_i16_aligned_monotonic:
-; -O0:    and w9, w8, w9
-; -O0:    mvn w12, w9
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas2_relax
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_aligned_monotonic:
 ; -O1:    ldxrh w8, [x0]
@@ -2358,12 +2266,10 @@ define dso_local i16 @atomicrmw_nand_i16_aligned_monotonic(ptr %ptr, i16 %value)
 
 define dso_local i16 @atomicrmw_nand_i16_aligned_acquire(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_nand_i16_aligned_acquire:
-; -O0:    and w9, w8, w9
-; -O0:    mvn w12, w9
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas2_acq
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_aligned_acquire:
 ; -O1:    ldaxrh w8, [x0]
@@ -2376,12 +2282,10 @@ define dso_local i16 @atomicrmw_nand_i16_aligned_acquire(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_nand_i16_aligned_release(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_nand_i16_aligned_release:
-; -O0:    and w9, w8, w9
-; -O0:    mvn w12, w9
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas2_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_aligned_release:
 ; -O1:    ldxrh w8, [x0]
@@ -2394,12 +2298,10 @@ define dso_local i16 @atomicrmw_nand_i16_aligned_release(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_nand_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_nand_i16_aligned_acq_rel:
-; -O0:    and w9, w8, w9
-; -O0:    mvn w12, w9
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas2_acq_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_aligned_acq_rel:
 ; -O1:    ldaxrh w8, [x0]
@@ -2412,12 +2314,10 @@ define dso_local i16 @atomicrmw_nand_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_nand_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_nand_i16_aligned_seq_cst:
-; -O0:    and w9, w8, w9
-; -O0:    mvn w12, w9
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas2_acq_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_aligned_seq_cst:
 ; -O1:    ldaxrh w8, [x0]
@@ -2430,12 +2330,10 @@ define dso_local i16 @atomicrmw_nand_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 
 define dso_local i32 @atomicrmw_nand_i32_aligned_monotonic(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_nand_i32_aligned_monotonic:
-; -O0:    and w9, w8, w9
-; -O0:    mvn w12, w9
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas4_relax
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_aligned_monotonic:
 ; -O1:    ldxr w8, [x0]
@@ -2448,12 +2346,10 @@ define dso_local i32 @atomicrmw_nand_i32_aligned_monotonic(ptr %ptr, i32 %value)
 
 define dso_local i32 @atomicrmw_nand_i32_aligned_acquire(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_nand_i32_aligned_acquire:
-; -O0:    and w9, w8, w9
-; -O0:    mvn w12, w9
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas4_acq
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_aligned_acquire:
 ; -O1:    ldaxr w8, [x0]
@@ -2466,12 +2362,10 @@ define dso_local i32 @atomicrmw_nand_i32_aligned_acquire(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_nand_i32_aligned_release(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_nand_i32_aligned_release:
-; -O0:    and w9, w8, w9
-; -O0:    mvn w12, w9
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas4_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_aligned_release:
 ; -O1:    ldxr w8, [x0]
@@ -2484,12 +2378,10 @@ define dso_local i32 @atomicrmw_nand_i32_aligned_release(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_nand_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_nand_i32_aligned_acq_rel:
-; -O0:    and w9, w8, w9
-; -O0:    mvn w12, w9
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas4_acq_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_aligned_acq_rel:
 ; -O1:    ldaxr w8, [x0]
@@ -2502,12 +2394,10 @@ define dso_local i32 @atomicrmw_nand_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_nand_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_nand_i32_aligned_seq_cst:
-; -O0:    and w9, w8, w9
-; -O0:    mvn w12, w9
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas4_acq_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_aligned_seq_cst:
 ; -O1:    ldaxr w8, [x0]
@@ -2520,12 +2410,10 @@ define dso_local i32 @atomicrmw_nand_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 
 define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_nand_i64_aligned_monotonic:
-; -O0:    and x9, x8, x9
-; -O0:    mvn x12, x9
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    and x8, x0, x8
+; -O0:    mvn x1, x8
+; -O0:    bl __aarch64_cas8_relax
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_aligned_monotonic:
 ; -O1:    ldxr x0, [x8]
@@ -2538,12 +2426,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_monotonic(ptr %ptr, i64 %value)
 
 define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_nand_i64_aligned_acquire:
-; -O0:    and x9, x8, x9
-; -O0:    mvn x12, x9
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    and x8, x0, x8
+; -O0:    mvn x1, x8
+; -O0:    bl __aarch64_cas8_acq
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_aligned_acquire:
 ; -O1:    ldaxr x0, [x8]
@@ -2556,12 +2442,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acquire(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_nand_i64_aligned_release:
-; -O0:    and x9, x8, x9
-; -O0:    mvn x12, x9
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    and x8, x0, x8
+; -O0:    mvn x1, x8
+; -O0:    bl __aarch64_cas8_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_aligned_release:
 ; -O1:    ldxr x0, [x8]
@@ -2574,12 +2458,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_release(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_nand_i64_aligned_acq_rel:
-; -O0:    and x9, x8, x9
-; -O0:    mvn x12, x9
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    and x8, x0, x8
+; -O0:    mvn x1, x8
+; -O0:    bl __aarch64_cas8_acq_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_aligned_acq_rel:
 ; -O1:    ldaxr x0, [x8]
@@ -2592,12 +2474,10 @@ define dso_local i64 @atomicrmw_nand_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_nand_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_nand_i64_aligned_seq_cst:
-; -O0:    and x9, x8, x9
-; -O0:    mvn x12, x9
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    and x8, x0, x8
+; -O0:    mvn x1, x8
+; -O0:    bl __aarch64_cas8_acq_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_aligned_seq_cst:
 ; -O1:    ldaxr x0, [x8]
@@ -2612,16 +2492,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val
 ; -O0-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O0:    and x9, x8, x9
 ; -O0:    and x8, x8, x10
-; -O0:    mvn x14, x9
-; -O0:    mvn x15, x8
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    mvn x2, x9
+; -O0:    mvn x3, x8
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
@@ -2639,16 +2515,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value
 ; -O0-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O0:    and x9, x8, x9
 ; -O0:    and x8, x8, x10
-; -O0:    mvn x14, x9
-; -O0:    mvn x15, x8
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    mvn x2, x9
+; -O0:    mvn x3, x8
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
@@ -2666,16 +2538,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value
 ; -O0-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O0:    and x9, x8, x9
 ; -O0:    and x8, x8, x10
-; -O0:    mvn x14, x9
-; -O0:    mvn x15, x8
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    mvn x2, x9
+; -O0:    mvn x3, x8
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
@@ -2693,16 +2561,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value
 ; -O0-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O0:    and x9, x8, x9
 ; -O0:    and x8, x8, x10
-; -O0:    mvn x14, x9
-; -O0:    mvn x15, x8
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    mvn x2, x9
+; -O0:    mvn x3, x8
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
@@ -2720,16 +2584,12 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 ; -O0-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O0:    and x9, x8, x9
 ; -O0:    and x8, x8, x10
-; -O0:    mvn x14, x9
-; -O0:    mvn x15, x8
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    mvn x2, x9
+; -O0:    mvn x3, x8
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
@@ -2745,13 +2605,11 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 
 define dso_local i8 @atomicrmw_nand_i8_unaligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_nand_i8_unaligned_monotonic:
-; -O0:    and w8, w10, w8
-; -O0:    mvn w12, w8
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas1_relax
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_nand_i8_unaligned_monotonic:
 ; -O1:    ldxrb w8, [x0]
@@ -2764,13 +2622,11 @@ define dso_local i8 @atomicrmw_nand_i8_unaligned_monotonic(ptr %ptr, i8 %value)
 
 define dso_local i8 @atomicrmw_nand_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_nand_i8_unaligned_acquire:
-; -O0:    and w8, w10, w8
-; -O0:    mvn w12, w8
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas1_acq
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_nand_i8_unaligned_acquire:
 ; -O1:    ldaxrb w8, [x0]
@@ -2783,13 +2639,11 @@ define dso_local i8 @atomicrmw_nand_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_nand_i8_unaligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_nand_i8_unaligned_release:
-; -O0:    and w8, w10, w8
-; -O0:    mvn w12, w8
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas1_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_nand_i8_unaligned_release:
 ; -O1:    ldxrb w8, [x0]
@@ -2802,13 +2656,11 @@ define dso_local i8 @atomicrmw_nand_i8_unaligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_nand_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_nand_i8_unaligned_acq_rel:
-; -O0:    and w8, w10, w8
-; -O0:    mvn w12, w8
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_nand_i8_unaligned_acq_rel:
 ; -O1:    ldaxrb w8, [x0]
@@ -2821,13 +2673,11 @@ define dso_local i8 @atomicrmw_nand_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_nand_i8_unaligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_nand_i8_unaligned_seq_cst:
-; -O0:    and w8, w10, w8
-; -O0:    mvn w12, w8
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    and w8, w0, w8
+; -O0:    mvn w1, w8
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_nand_i8_unaligned_seq_cst:
 ; -O1:    ldaxrb w8, [x0]
@@ -3285,16 +3135,12 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic:
-; -O0:    orr x14, x8, x10
-; -O0:    orr x15, x8, x9
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    orr x2, x8, x10
+; -O0:    orr x3, x8, x9
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
@@ -3308,16 +3154,12 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value
 
 define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_acquire:
-; -O0:    orr x14, x8, x10
-; -O0:    orr x15, x8, x9
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    orr x2, x8, x10
+; -O0:    orr x3, x8, x9
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
@@ -3331,16 +3173,12 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_release:
-; -O0:    orr x14, x8, x10
-; -O0:    orr x15, x8, x9
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    orr x2, x8, x10
+; -O0:    orr x3, x8, x9
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
@@ -3354,16 +3192,12 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel:
-; -O0:    orr x14, x8, x10
-; -O0:    orr x15, x8, x9
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    orr x2, x8, x10
+; -O0:    orr x3, x8, x9
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
@@ -3377,16 +3211,12 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst:
-; -O0:    orr x14, x8, x10
-; -O0:    orr x15, x8, x9
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    orr x2, x8, x10
+; -O0:    orr x3, x8, x9
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
@@ -3830,16 +3660,12 @@ define dso_local i64 @atomicrmw_xor_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xor_i128_aligned_monotonic:
-; -O0:    eor x14, x8, x10
-; -O0:    eor x15, x8, x9
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    eor x2, x8, x10
+; -O0:    eor x3, x8, x9
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
@@ -3853,16 +3679,12 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu
 
 define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xor_i128_aligned_acquire:
-; -O0:    eor x14, x8, x10
-; -O0:    eor x15, x8, x9
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    eor x2, x8, x10
+; -O0:    eor x3, x8, x9
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
@@ -3876,16 +3698,12 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xor_i128_aligned_release:
-; -O0:    eor x14, x8, x10
-; -O0:    eor x15, x8, x9
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    eor x2, x8, x10
+; -O0:    eor x3, x8, x9
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
@@ -3899,16 +3717,12 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
-; -O0:    eor x14, x8, x10
-; -O0:    eor x15, x8, x9
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    eor x2, x8, x10
+; -O0:    eor x3, x8, x9
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
@@ -3922,16 +3736,12 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 
 define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
-; -O0:    eor x14, x8, x10
-; -O0:    eor x15, x8, x9
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    eor x2, x8, x10
+; -O0:    eor x3, x8, x9
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
@@ -4235,14 +4045,12 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 
 define dso_local i8 @atomicrmw_max_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_max_i8_aligned_monotonic:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, gt
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas1_relax
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_max_i8_aligned_monotonic:
 ; -O1:    ldxrb w9, [x0]
@@ -4256,14 +4064,12 @@ define dso_local i8 @atomicrmw_max_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_max_i8_aligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_max_i8_aligned_acquire:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, gt
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas1_acq
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_max_i8_aligned_acquire:
 ; -O1:    ldaxrb w9, [x0]
@@ -4277,14 +4083,12 @@ define dso_local i8 @atomicrmw_max_i8_aligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_max_i8_aligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_max_i8_aligned_release:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, gt
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas1_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_max_i8_aligned_release:
 ; -O1:    ldxrb w9, [x0]
@@ -4298,14 +4102,12 @@ define dso_local i8 @atomicrmw_max_i8_aligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_max_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_max_i8_aligned_acq_rel:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, gt
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_max_i8_aligned_acq_rel:
 ; -O1:    ldaxrb w9, [x0]
@@ -4319,14 +4121,12 @@ define dso_local i8 @atomicrmw_max_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_max_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_max_i8_aligned_seq_cst:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, gt
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_max_i8_aligned_seq_cst:
 ; -O1:    ldaxrb w9, [x0]
@@ -4340,13 +4140,11 @@ define dso_local i8 @atomicrmw_max_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 
 define dso_local i16 @atomicrmw_max_i16_aligned_monotonic(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_max_i16_aligned_monotonic:
-; -O0:    sxth w10, w8
-; -O0:    subs w10, w10, w9, sxth
-; -O0:    csel w12, w8, w9, gt
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    sxth w9, w0
+; -O0:    subs w9, w9, w8, sxth
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas2_relax
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_max_i16_aligned_monotonic:
 ; -O1:    ldxrh w9, [x0]
@@ -4360,13 +4158,11 @@ define dso_local i16 @atomicrmw_max_i16_aligned_monotonic(ptr %ptr, i16 %value)
 
 define dso_local i16 @atomicrmw_max_i16_aligned_acquire(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_max_i16_aligned_acquire:
-; -O0:    sxth w10, w8
-; -O0:    subs w10, w10, w9, sxth
-; -O0:    csel w12, w8, w9, gt
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    sxth w9, w0
+; -O0:    subs w9, w9, w8, sxth
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas2_acq
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_max_i16_aligned_acquire:
 ; -O1:    ldaxrh w9, [x0]
@@ -4380,13 +4176,11 @@ define dso_local i16 @atomicrmw_max_i16_aligned_acquire(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_max_i16_aligned_release(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_max_i16_aligned_release:
-; -O0:    sxth w10, w8
-; -O0:    subs w10, w10, w9, sxth
-; -O0:    csel w12, w8, w9, gt
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    sxth w9, w0
+; -O0:    subs w9, w9, w8, sxth
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas2_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_max_i16_aligned_release:
 ; -O1:    ldxrh w9, [x0]
@@ -4400,13 +4194,11 @@ define dso_local i16 @atomicrmw_max_i16_aligned_release(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_max_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_max_i16_aligned_acq_rel:
-; -O0:    sxth w10, w8
-; -O0:    subs w10, w10, w9, sxth
-; -O0:    csel w12, w8, w9, gt
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    sxth w9, w0
+; -O0:    subs w9, w9, w8, sxth
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas2_acq_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_max_i16_aligned_acq_rel:
 ; -O1:    ldaxrh w9, [x0]
@@ -4420,13 +4212,11 @@ define dso_local i16 @atomicrmw_max_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_max_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_max_i16_aligned_seq_cst:
-; -O0:    sxth w10, w8
-; -O0:    subs w10, w10, w9, sxth
-; -O0:    csel w12, w8, w9, gt
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    sxth w9, w0
+; -O0:    subs w9, w9, w8, sxth
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas2_acq_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_max_i16_aligned_seq_cst:
 ; -O1:    ldaxrh w9, [x0]
@@ -4440,12 +4230,10 @@ define dso_local i16 @atomicrmw_max_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 
 define dso_local i32 @atomicrmw_max_i32_aligned_monotonic(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_max_i32_aligned_monotonic:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, gt
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas4_relax
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_max_i32_aligned_monotonic:
 ; -O1:    ldxr w8, [x0]
@@ -4458,12 +4246,10 @@ define dso_local i32 @atomicrmw_max_i32_aligned_monotonic(ptr %ptr, i32 %value)
 
 define dso_local i32 @atomicrmw_max_i32_aligned_acquire(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_max_i32_aligned_acquire:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, gt
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas4_acq
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_max_i32_aligned_acquire:
 ; -O1:    ldaxr w8, [x0]
@@ -4476,12 +4262,10 @@ define dso_local i32 @atomicrmw_max_i32_aligned_acquire(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_max_i32_aligned_release(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_max_i32_aligned_release:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, gt
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas4_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_max_i32_aligned_release:
 ; -O1:    ldxr w8, [x0]
@@ -4494,12 +4278,10 @@ define dso_local i32 @atomicrmw_max_i32_aligned_release(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_max_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_max_i32_aligned_acq_rel:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, gt
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas4_acq_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_max_i32_aligned_acq_rel:
 ; -O1:    ldaxr w8, [x0]
@@ -4512,12 +4294,10 @@ define dso_local i32 @atomicrmw_max_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_max_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_max_i32_aligned_seq_cst:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, gt
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas4_acq_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_max_i32_aligned_seq_cst:
 ; -O1:    ldaxr w8, [x0]
@@ -4530,12 +4310,10 @@ define dso_local i32 @atomicrmw_max_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 
 define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_max_i64_aligned_monotonic:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, gt
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, gt
+; -O0:    bl __aarch64_cas8_relax
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_max_i64_aligned_monotonic:
 ; -O1:    ldxr x0, [x8]
@@ -4548,12 +4326,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_monotonic(ptr %ptr, i64 %value)
 
 define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_max_i64_aligned_acquire:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, gt
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, gt
+; -O0:    bl __aarch64_cas8_acq
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_max_i64_aligned_acquire:
 ; -O1:    ldaxr x0, [x8]
@@ -4566,12 +4342,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acquire(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_max_i64_aligned_release:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, gt
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, gt
+; -O0:    bl __aarch64_cas8_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_max_i64_aligned_release:
 ; -O1:    ldxr x0, [x8]
@@ -4584,12 +4358,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_release(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_max_i64_aligned_acq_rel:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, gt
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, gt
+; -O0:    bl __aarch64_cas8_acq_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_max_i64_aligned_acq_rel:
 ; -O1:    ldaxr x0, [x8]
@@ -4602,12 +4374,10 @@ define dso_local i64 @atomicrmw_max_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_max_i64_aligned_seq_cst:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, gt
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, gt
+; -O0:    bl __aarch64_cas8_acq_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_max_i64_aligned_seq_cst:
 ; -O1:    ldaxr x0, [x8]
@@ -4621,21 +4391,17 @@ define dso_local i64 @atomicrmw_max_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic:
@@ -4651,21 +4417,17 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %valu
 define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire:
@@ -4681,21 +4443,17 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value)
 define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_max_i128_aligned_release:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_max_i128_aligned_release:
@@ -4711,21 +4469,17 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value)
 define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel:
@@ -4741,21 +4495,17 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst:
@@ -4770,14 +4520,12 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 
 define dso_local i8 @atomicrmw_max_i8_unaligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_max_i8_unaligned_monotonic:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, gt
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas1_relax
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_max_i8_unaligned_monotonic:
 ; -O1:    ldxrb w9, [x0]
@@ -4791,14 +4539,12 @@ define dso_local i8 @atomicrmw_max_i8_unaligned_monotonic(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_max_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_max_i8_unaligned_acquire:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, gt
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas1_acq
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_max_i8_unaligned_acquire:
 ; -O1:    ldaxrb w9, [x0]
@@ -4812,14 +4558,12 @@ define dso_local i8 @atomicrmw_max_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_max_i8_unaligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_max_i8_unaligned_release:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, gt
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas1_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_max_i8_unaligned_release:
 ; -O1:    ldxrb w9, [x0]
@@ -4833,14 +4577,12 @@ define dso_local i8 @atomicrmw_max_i8_unaligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_max_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_max_i8_unaligned_acq_rel:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, gt
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_max_i8_unaligned_acq_rel:
 ; -O1:    ldaxrb w9, [x0]
@@ -4854,14 +4596,12 @@ define dso_local i8 @atomicrmw_max_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_max_i8_unaligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_max_i8_unaligned_seq_cst:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, gt
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, gt
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_max_i8_unaligned_seq_cst:
 ; -O1:    ldaxrb w9, [x0]
@@ -5205,14 +4945,12 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 
 define dso_local i8 @atomicrmw_min_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_min_i8_aligned_monotonic:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, le
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas1_relax
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_min_i8_aligned_monotonic:
 ; -O1:    ldxrb w9, [x0]
@@ -5226,14 +4964,12 @@ define dso_local i8 @atomicrmw_min_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_min_i8_aligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_min_i8_aligned_acquire:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, le
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas1_acq
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_min_i8_aligned_acquire:
 ; -O1:    ldaxrb w9, [x0]
@@ -5247,14 +4983,12 @@ define dso_local i8 @atomicrmw_min_i8_aligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_min_i8_aligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_min_i8_aligned_release:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, le
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas1_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_min_i8_aligned_release:
 ; -O1:    ldxrb w9, [x0]
@@ -5268,14 +5002,12 @@ define dso_local i8 @atomicrmw_min_i8_aligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_min_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_min_i8_aligned_acq_rel:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, le
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_min_i8_aligned_acq_rel:
 ; -O1:    ldaxrb w9, [x0]
@@ -5289,14 +5021,12 @@ define dso_local i8 @atomicrmw_min_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_min_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_min_i8_aligned_seq_cst:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, le
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_min_i8_aligned_seq_cst:
 ; -O1:    ldaxrb w9, [x0]
@@ -5310,13 +5040,11 @@ define dso_local i8 @atomicrmw_min_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 
 define dso_local i16 @atomicrmw_min_i16_aligned_monotonic(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_min_i16_aligned_monotonic:
-; -O0:    sxth w10, w8
-; -O0:    subs w10, w10, w9, sxth
-; -O0:    csel w12, w8, w9, le
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    sxth w9, w0
+; -O0:    subs w9, w9, w8, sxth
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas2_relax
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_min_i16_aligned_monotonic:
 ; -O1:    ldxrh w9, [x0]
@@ -5330,13 +5058,11 @@ define dso_local i16 @atomicrmw_min_i16_aligned_monotonic(ptr %ptr, i16 %value)
 
 define dso_local i16 @atomicrmw_min_i16_aligned_acquire(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_min_i16_aligned_acquire:
-; -O0:    sxth w10, w8
-; -O0:    subs w10, w10, w9, sxth
-; -O0:    csel w12, w8, w9, le
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    sxth w9, w0
+; -O0:    subs w9, w9, w8, sxth
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas2_acq
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_min_i16_aligned_acquire:
 ; -O1:    ldaxrh w9, [x0]
@@ -5350,13 +5076,11 @@ define dso_local i16 @atomicrmw_min_i16_aligned_acquire(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_min_i16_aligned_release(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_min_i16_aligned_release:
-; -O0:    sxth w10, w8
-; -O0:    subs w10, w10, w9, sxth
-; -O0:    csel w12, w8, w9, le
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    sxth w9, w0
+; -O0:    subs w9, w9, w8, sxth
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas2_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_min_i16_aligned_release:
 ; -O1:    ldxrh w9, [x0]
@@ -5370,13 +5094,11 @@ define dso_local i16 @atomicrmw_min_i16_aligned_release(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_min_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_min_i16_aligned_acq_rel:
-; -O0:    sxth w10, w8
-; -O0:    subs w10, w10, w9, sxth
-; -O0:    csel w12, w8, w9, le
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    sxth w9, w0
+; -O0:    subs w9, w9, w8, sxth
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas2_acq_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_min_i16_aligned_acq_rel:
 ; -O1:    ldaxrh w9, [x0]
@@ -5390,13 +5112,11 @@ define dso_local i16 @atomicrmw_min_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_min_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_min_i16_aligned_seq_cst:
-; -O0:    sxth w10, w8
-; -O0:    subs w10, w10, w9, sxth
-; -O0:    csel w12, w8, w9, le
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    sxth w9, w0
+; -O0:    subs w9, w9, w8, sxth
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas2_acq_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_min_i16_aligned_seq_cst:
 ; -O1:    ldaxrh w9, [x0]
@@ -5410,12 +5130,10 @@ define dso_local i16 @atomicrmw_min_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 
 define dso_local i32 @atomicrmw_min_i32_aligned_monotonic(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_min_i32_aligned_monotonic:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, le
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas4_relax
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_min_i32_aligned_monotonic:
 ; -O1:    ldxr w8, [x0]
@@ -5428,12 +5146,10 @@ define dso_local i32 @atomicrmw_min_i32_aligned_monotonic(ptr %ptr, i32 %value)
 
 define dso_local i32 @atomicrmw_min_i32_aligned_acquire(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_min_i32_aligned_acquire:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, le
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas4_acq
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_min_i32_aligned_acquire:
 ; -O1:    ldaxr w8, [x0]
@@ -5446,12 +5162,10 @@ define dso_local i32 @atomicrmw_min_i32_aligned_acquire(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_min_i32_aligned_release(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_min_i32_aligned_release:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, le
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas4_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_min_i32_aligned_release:
 ; -O1:    ldxr w8, [x0]
@@ -5464,12 +5178,10 @@ define dso_local i32 @atomicrmw_min_i32_aligned_release(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_min_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_min_i32_aligned_acq_rel:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, le
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas4_acq_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_min_i32_aligned_acq_rel:
 ; -O1:    ldaxr w8, [x0]
@@ -5482,12 +5194,10 @@ define dso_local i32 @atomicrmw_min_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_min_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_min_i32_aligned_seq_cst:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, le
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas4_acq_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_min_i32_aligned_seq_cst:
 ; -O1:    ldaxr w8, [x0]
@@ -5500,12 +5210,10 @@ define dso_local i32 @atomicrmw_min_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 
 define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_min_i64_aligned_monotonic:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, le
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, le
+; -O0:    bl __aarch64_cas8_relax
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_min_i64_aligned_monotonic:
 ; -O1:    ldxr x0, [x8]
@@ -5518,12 +5226,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_monotonic(ptr %ptr, i64 %value)
 
 define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_min_i64_aligned_acquire:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, le
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, le
+; -O0:    bl __aarch64_cas8_acq
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_min_i64_aligned_acquire:
 ; -O1:    ldaxr x0, [x8]
@@ -5536,12 +5242,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acquire(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_min_i64_aligned_release:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, le
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, le
+; -O0:    bl __aarch64_cas8_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_min_i64_aligned_release:
 ; -O1:    ldxr x0, [x8]
@@ -5554,12 +5258,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_release(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_min_i64_aligned_acq_rel:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, le
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, le
+; -O0:    bl __aarch64_cas8_acq_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_min_i64_aligned_acq_rel:
 ; -O1:    ldaxr x0, [x8]
@@ -5572,12 +5274,10 @@ define dso_local i64 @atomicrmw_min_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_min_i64_aligned_seq_cst:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, le
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, le
+; -O0:    bl __aarch64_cas8_acq_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_min_i64_aligned_seq_cst:
 ; -O1:    ldaxr x0, [x8]
@@ -5591,21 +5291,17 @@ define dso_local i64 @atomicrmw_min_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic:
@@ -5621,21 +5317,17 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %valu
 define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire:
@@ -5651,21 +5343,17 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value)
 define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_min_i128_aligned_release:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_min_i128_aligned_release:
@@ -5681,21 +5369,17 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value)
 define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel:
@@ -5711,21 +5395,17 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst:
@@ -5740,14 +5420,12 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 
 define dso_local i8 @atomicrmw_min_i8_unaligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_min_i8_unaligned_monotonic:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, le
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas1_relax
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_min_i8_unaligned_monotonic:
 ; -O1:    ldxrb w9, [x0]
@@ -5761,14 +5439,12 @@ define dso_local i8 @atomicrmw_min_i8_unaligned_monotonic(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_min_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_min_i8_unaligned_acquire:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, le
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas1_acq
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_min_i8_unaligned_acquire:
 ; -O1:    ldaxrb w9, [x0]
@@ -5782,14 +5458,12 @@ define dso_local i8 @atomicrmw_min_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_min_i8_unaligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_min_i8_unaligned_release:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, le
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas1_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_min_i8_unaligned_release:
 ; -O1:    ldxrb w9, [x0]
@@ -5803,14 +5477,12 @@ define dso_local i8 @atomicrmw_min_i8_unaligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_min_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_min_i8_unaligned_acq_rel:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, le
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_min_i8_unaligned_acq_rel:
 ; -O1:    ldaxrb w9, [x0]
@@ -5824,14 +5496,12 @@ define dso_local i8 @atomicrmw_min_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_min_i8_unaligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_min_i8_unaligned_seq_cst:
-; -O0:    sxtb w9, w10
+; -O0:    sxtb w9, w0
 ; -O0:    subs w9, w9, w8, sxtb
-; -O0:    csel w12, w10, w8, le
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, le
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_min_i8_unaligned_seq_cst:
 ; -O1:    ldaxrb w9, [x0]
@@ -6175,14 +5845,12 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 
 define dso_local i8 @atomicrmw_umax_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umax_i8_aligned_monotonic:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, hi
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas1_relax
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umax_i8_aligned_monotonic:
 ; -O1:    and w9, w1, #0xff
@@ -6196,14 +5864,12 @@ define dso_local i8 @atomicrmw_umax_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umax_i8_aligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umax_i8_aligned_acquire:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, hi
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas1_acq
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umax_i8_aligned_acquire:
 ; -O1:    and w9, w1, #0xff
@@ -6217,14 +5883,12 @@ define dso_local i8 @atomicrmw_umax_i8_aligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umax_i8_aligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umax_i8_aligned_release:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, hi
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas1_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umax_i8_aligned_release:
 ; -O1:    and w9, w1, #0xff
@@ -6238,14 +5902,12 @@ define dso_local i8 @atomicrmw_umax_i8_aligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umax_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umax_i8_aligned_acq_rel:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, hi
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umax_i8_aligned_acq_rel:
 ; -O1:    and w9, w1, #0xff
@@ -6259,14 +5921,12 @@ define dso_local i8 @atomicrmw_umax_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umax_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umax_i8_aligned_seq_cst:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, hi
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umax_i8_aligned_seq_cst:
 ; -O1:    and w9, w1, #0xff
@@ -6280,12 +5940,10 @@ define dso_local i8 @atomicrmw_umax_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 
 define dso_local i16 @atomicrmw_umax_i16_aligned_monotonic(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_umax_i16_aligned_monotonic:
-; -O0:    subs w10, w10, w9, uxth
-; -O0:    csel w12, w8, w9, hi
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    subs w9, w9, w8, uxth
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas2_relax
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_aligned_monotonic:
 ; -O1:    and w9, w1, #0xffff
@@ -6299,12 +5957,10 @@ define dso_local i16 @atomicrmw_umax_i16_aligned_monotonic(ptr %ptr, i16 %value)
 
 define dso_local i16 @atomicrmw_umax_i16_aligned_acquire(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_umax_i16_aligned_acquire:
-; -O0:    subs w10, w10, w9, uxth
-; -O0:    csel w12, w8, w9, hi
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    subs w9, w9, w8, uxth
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas2_acq
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_aligned_acquire:
 ; -O1:    and w9, w1, #0xffff
@@ -6318,12 +5974,10 @@ define dso_local i16 @atomicrmw_umax_i16_aligned_acquire(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_umax_i16_aligned_release(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_umax_i16_aligned_release:
-; -O0:    subs w10, w10, w9, uxth
-; -O0:    csel w12, w8, w9, hi
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    subs w9, w9, w8, uxth
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas2_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_aligned_release:
 ; -O1:    and w9, w1, #0xffff
@@ -6337,12 +5991,10 @@ define dso_local i16 @atomicrmw_umax_i16_aligned_release(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_umax_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_umax_i16_aligned_acq_rel:
-; -O0:    subs w10, w10, w9, uxth
-; -O0:    csel w12, w8, w9, hi
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    subs w9, w9, w8, uxth
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas2_acq_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_aligned_acq_rel:
 ; -O1:    and w9, w1, #0xffff
@@ -6356,12 +6008,10 @@ define dso_local i16 @atomicrmw_umax_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_umax_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_umax_i16_aligned_seq_cst:
-; -O0:    subs w10, w10, w9, uxth
-; -O0:    csel w12, w8, w9, hi
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    subs w9, w9, w8, uxth
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas2_acq_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_aligned_seq_cst:
 ; -O1:    and w9, w1, #0xffff
@@ -6375,12 +6025,10 @@ define dso_local i16 @atomicrmw_umax_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 
 define dso_local i32 @atomicrmw_umax_i32_aligned_monotonic(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_umax_i32_aligned_monotonic:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, hi
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas4_relax
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_aligned_monotonic:
 ; -O1:    ldxr w8, [x0]
@@ -6393,12 +6041,10 @@ define dso_local i32 @atomicrmw_umax_i32_aligned_monotonic(ptr %ptr, i32 %value)
 
 define dso_local i32 @atomicrmw_umax_i32_aligned_acquire(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_umax_i32_aligned_acquire:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, hi
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas4_acq
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_aligned_acquire:
 ; -O1:    ldaxr w8, [x0]
@@ -6411,12 +6057,10 @@ define dso_local i32 @atomicrmw_umax_i32_aligned_acquire(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_umax_i32_aligned_release(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_umax_i32_aligned_release:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, hi
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas4_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_aligned_release:
 ; -O1:    ldxr w8, [x0]
@@ -6429,12 +6073,10 @@ define dso_local i32 @atomicrmw_umax_i32_aligned_release(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_umax_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_umax_i32_aligned_acq_rel:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, hi
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas4_acq_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_aligned_acq_rel:
 ; -O1:    ldaxr w8, [x0]
@@ -6447,12 +6089,10 @@ define dso_local i32 @atomicrmw_umax_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_umax_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_umax_i32_aligned_seq_cst:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, hi
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas4_acq_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_aligned_seq_cst:
 ; -O1:    ldaxr w8, [x0]
@@ -6465,12 +6105,10 @@ define dso_local i32 @atomicrmw_umax_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 
 define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_umax_i64_aligned_monotonic:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, hi
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, hi
+; -O0:    bl __aarch64_cas8_relax
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_aligned_monotonic:
 ; -O1:    ldxr x0, [x8]
@@ -6483,12 +6121,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_monotonic(ptr %ptr, i64 %value)
 
 define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_umax_i64_aligned_acquire:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, hi
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, hi
+; -O0:    bl __aarch64_cas8_acq
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_aligned_acquire:
 ; -O1:    ldaxr x0, [x8]
@@ -6501,12 +6137,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acquire(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_umax_i64_aligned_release:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, hi
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, hi
+; -O0:    bl __aarch64_cas8_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_aligned_release:
 ; -O1:    ldxr x0, [x8]
@@ -6519,12 +6153,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_release(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_umax_i64_aligned_acq_rel:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, hi
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, hi
+; -O0:    bl __aarch64_cas8_acq_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_aligned_acq_rel:
 ; -O1:    ldaxr x0, [x8]
@@ -6537,12 +6169,10 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_umax_i64_aligned_seq_cst:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, hi
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, hi
+; -O0:    bl __aarch64_cas8_acq_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_aligned_seq_cst:
 ; -O1:    ldaxr x0, [x8]
@@ -6556,21 +6186,17 @@ define dso_local i64 @atomicrmw_umax_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic:
@@ -6586,21 +6212,17 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %val
 define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire:
@@ -6616,21 +6238,17 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value
 define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umax_i128_aligned_release:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_aligned_release:
@@ -6646,21 +6264,17 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value
 define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel:
@@ -6676,21 +6290,17 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value
 define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst:
@@ -6705,14 +6315,12 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value
 
 define dso_local i8 @atomicrmw_umax_i8_unaligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umax_i8_unaligned_monotonic:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, hi
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas1_relax
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umax_i8_unaligned_monotonic:
 ; -O1:    and w9, w1, #0xff
@@ -6726,14 +6334,12 @@ define dso_local i8 @atomicrmw_umax_i8_unaligned_monotonic(ptr %ptr, i8 %value)
 
 define dso_local i8 @atomicrmw_umax_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umax_i8_unaligned_acquire:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, hi
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas1_acq
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umax_i8_unaligned_acquire:
 ; -O1:    and w9, w1, #0xff
@@ -6747,14 +6353,12 @@ define dso_local i8 @atomicrmw_umax_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umax_i8_unaligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umax_i8_unaligned_release:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, hi
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas1_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umax_i8_unaligned_release:
 ; -O1:    and w9, w1, #0xff
@@ -6768,14 +6372,12 @@ define dso_local i8 @atomicrmw_umax_i8_unaligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umax_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umax_i8_unaligned_acq_rel:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, hi
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umax_i8_unaligned_acq_rel:
 ; -O1:    and w9, w1, #0xff
@@ -6789,14 +6391,12 @@ define dso_local i8 @atomicrmw_umax_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umax_i8_unaligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umax_i8_unaligned_seq_cst:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, hi
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, hi
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umax_i8_unaligned_seq_cst:
 ; -O1:    and w9, w1, #0xff
@@ -7135,14 +6735,12 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 
 define dso_local i8 @atomicrmw_umin_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umin_i8_aligned_monotonic:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, ls
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas1_relax
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umin_i8_aligned_monotonic:
 ; -O1:    and w9, w1, #0xff
@@ -7156,14 +6754,12 @@ define dso_local i8 @atomicrmw_umin_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umin_i8_aligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umin_i8_aligned_acquire:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, ls
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas1_acq
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umin_i8_aligned_acquire:
 ; -O1:    and w9, w1, #0xff
@@ -7177,14 +6773,12 @@ define dso_local i8 @atomicrmw_umin_i8_aligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umin_i8_aligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umin_i8_aligned_release:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, ls
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas1_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umin_i8_aligned_release:
 ; -O1:    and w9, w1, #0xff
@@ -7198,14 +6792,12 @@ define dso_local i8 @atomicrmw_umin_i8_aligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umin_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umin_i8_aligned_acq_rel:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, ls
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umin_i8_aligned_acq_rel:
 ; -O1:    and w9, w1, #0xff
@@ -7219,14 +6811,12 @@ define dso_local i8 @atomicrmw_umin_i8_aligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umin_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umin_i8_aligned_seq_cst:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, ls
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umin_i8_aligned_seq_cst:
 ; -O1:    and w9, w1, #0xff
@@ -7240,12 +6830,10 @@ define dso_local i8 @atomicrmw_umin_i8_aligned_seq_cst(ptr %ptr, i8 %value) {
 
 define dso_local i16 @atomicrmw_umin_i16_aligned_monotonic(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_umin_i16_aligned_monotonic:
-; -O0:    subs w10, w10, w9, uxth
-; -O0:    csel w12, w8, w9, ls
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    subs w9, w9, w8, uxth
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas2_relax
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_aligned_monotonic:
 ; -O1:    and w9, w1, #0xffff
@@ -7259,12 +6847,10 @@ define dso_local i16 @atomicrmw_umin_i16_aligned_monotonic(ptr %ptr, i16 %value)
 
 define dso_local i16 @atomicrmw_umin_i16_aligned_acquire(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_umin_i16_aligned_acquire:
-; -O0:    subs w10, w10, w9, uxth
-; -O0:    csel w12, w8, w9, ls
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    subs w9, w9, w8, uxth
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas2_acq
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_aligned_acquire:
 ; -O1:    and w9, w1, #0xffff
@@ -7278,12 +6864,10 @@ define dso_local i16 @atomicrmw_umin_i16_aligned_acquire(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_umin_i16_aligned_release(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_umin_i16_aligned_release:
-; -O0:    subs w10, w10, w9, uxth
-; -O0:    csel w12, w8, w9, ls
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    subs w9, w9, w8, uxth
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas2_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_aligned_release:
 ; -O1:    and w9, w1, #0xffff
@@ -7297,12 +6881,10 @@ define dso_local i16 @atomicrmw_umin_i16_aligned_release(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_umin_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_umin_i16_aligned_acq_rel:
-; -O0:    subs w10, w10, w9, uxth
-; -O0:    csel w12, w8, w9, ls
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    subs w9, w9, w8, uxth
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas2_acq_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_aligned_acq_rel:
 ; -O1:    and w9, w1, #0xffff
@@ -7316,12 +6898,10 @@ define dso_local i16 @atomicrmw_umin_i16_aligned_acq_rel(ptr %ptr, i16 %value) {
 
 define dso_local i16 @atomicrmw_umin_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0-LABEL: atomicrmw_umin_i16_aligned_seq_cst:
-; -O0:    subs w10, w10, w9, uxth
-; -O0:    csel w12, w8, w9, ls
-; -O0:    ldaxrh w9, [x11]
-; -O0:    cmp w9, w8, uxth
-; -O0:    stlxrh w10, w12, [x11]
-; -O0:    subs w8, w8, w9, uxth
+; -O0:    subs w9, w9, w8, uxth
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas2_acq_rel
+; -O0:    subs w8, w8, w0, uxth
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_aligned_seq_cst:
 ; -O1:    and w9, w1, #0xffff
@@ -7335,12 +6915,10 @@ define dso_local i16 @atomicrmw_umin_i16_aligned_seq_cst(ptr %ptr, i16 %value) {
 
 define dso_local i32 @atomicrmw_umin_i32_aligned_monotonic(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_umin_i32_aligned_monotonic:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, ls
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas4_relax
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_aligned_monotonic:
 ; -O1:    ldxr w8, [x0]
@@ -7353,12 +6931,10 @@ define dso_local i32 @atomicrmw_umin_i32_aligned_monotonic(ptr %ptr, i32 %value)
 
 define dso_local i32 @atomicrmw_umin_i32_aligned_acquire(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_umin_i32_aligned_acquire:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, ls
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas4_acq
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_aligned_acquire:
 ; -O1:    ldaxr w8, [x0]
@@ -7371,12 +6947,10 @@ define dso_local i32 @atomicrmw_umin_i32_aligned_acquire(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_umin_i32_aligned_release(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_umin_i32_aligned_release:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, ls
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas4_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_aligned_release:
 ; -O1:    ldxr w8, [x0]
@@ -7389,12 +6963,10 @@ define dso_local i32 @atomicrmw_umin_i32_aligned_release(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_umin_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_umin_i32_aligned_acq_rel:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, ls
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas4_acq_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_aligned_acq_rel:
 ; -O1:    ldaxr w8, [x0]
@@ -7407,12 +6979,10 @@ define dso_local i32 @atomicrmw_umin_i32_aligned_acq_rel(ptr %ptr, i32 %value) {
 
 define dso_local i32 @atomicrmw_umin_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0-LABEL: atomicrmw_umin_i32_aligned_seq_cst:
-; -O0:    subs w10, w8, w9
-; -O0:    csel w12, w8, w9, ls
-; -O0:    ldaxr w9, [x11]
-; -O0:    cmp w9, w8
-; -O0:    stlxr w10, w12, [x11]
-; -O0:    subs w8, w9, w8
+; -O0:    subs w9, w0, w8
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas4_acq_rel
+; -O0:    subs w8, w0, w8
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_aligned_seq_cst:
 ; -O1:    ldaxr w8, [x0]
@@ -7425,12 +6995,10 @@ define dso_local i32 @atomicrmw_umin_i32_aligned_seq_cst(ptr %ptr, i32 %value) {
 
 define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_umin_i64_aligned_monotonic:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, ls
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, ls
+; -O0:    bl __aarch64_cas8_relax
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_aligned_monotonic:
 ; -O1:    ldxr x0, [x8]
@@ -7443,12 +7011,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_monotonic(ptr %ptr, i64 %value)
 
 define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_umin_i64_aligned_acquire:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, ls
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, ls
+; -O0:    bl __aarch64_cas8_acq
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_aligned_acquire:
 ; -O1:    ldaxr x0, [x8]
@@ -7461,12 +7027,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acquire(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_umin_i64_aligned_release:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, ls
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, ls
+; -O0:    bl __aarch64_cas8_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_aligned_release:
 ; -O1:    ldxr x0, [x8]
@@ -7479,12 +7043,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_release(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_umin_i64_aligned_acq_rel:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, ls
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, ls
+; -O0:    bl __aarch64_cas8_acq_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_aligned_acq_rel:
 ; -O1:    ldaxr x0, [x8]
@@ -7497,12 +7059,10 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_acq_rel(ptr %ptr, i64 %value) {
 
 define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0-LABEL: atomicrmw_umin_i64_aligned_seq_cst:
-; -O0:    subs x10, x8, x9
-; -O0:    csel x12, x8, x9, ls
-; -O0:    ldaxr x9, [x11]
-; -O0:    cmp x9, x8
-; -O0:    stlxr w10, x12, [x11]
-; -O0:    subs x8, x9, x8
+; -O0:    subs x9, x0, x8
+; -O0:    csel x1, x0, x8, ls
+; -O0:    bl __aarch64_cas8_acq_rel
+; -O0:    subs x8, x0, x8
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_aligned_seq_cst:
 ; -O1:    ldaxr x0, [x8]
@@ -7516,21 +7076,17 @@ define dso_local i64 @atomicrmw_umin_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic:
@@ -7546,21 +7102,17 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %val
 define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire:
@@ -7576,21 +7128,17 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value
 define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umin_i128_aligned_release:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_aligned_release:
@@ -7606,21 +7154,17 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value
 define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel:
@@ -7636,21 +7180,17 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value
 define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst:
 ; -O0:    subs x8, x8, x9
-; -O0:    subs x8, x8, x12
-; -O0:    subs x13, x13, x9
+; -O0:    subs x8, x8, x11
+; -O0:    subs x12, x12, x9
 ; -O0:    csel w10, w8, w10, eq
-; -O0:    ands w13, w10, #0x1
-; -O0:    csel x14, x8, x12, ne
+; -O0:    ands w12, w10, #0x1
+; -O0:    csel x2, x8, x11, ne
 ; -O0:    ands w10, w10, #0x1
-; -O0:    csel x15, x8, x9, ne
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    csel x3, x8, x9, ne
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst:
@@ -7665,14 +7205,12 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value
 
 define dso_local i8 @atomicrmw_umin_i8_unaligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umin_i8_unaligned_monotonic:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, ls
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas1_relax
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umin_i8_unaligned_monotonic:
 ; -O1:    and w9, w1, #0xff
@@ -7686,14 +7224,12 @@ define dso_local i8 @atomicrmw_umin_i8_unaligned_monotonic(ptr %ptr, i8 %value)
 
 define dso_local i8 @atomicrmw_umin_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umin_i8_unaligned_acquire:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, ls
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas1_acq
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umin_i8_unaligned_acquire:
 ; -O1:    and w9, w1, #0xff
@@ -7707,14 +7243,12 @@ define dso_local i8 @atomicrmw_umin_i8_unaligned_acquire(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umin_i8_unaligned_release(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umin_i8_unaligned_release:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, ls
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas1_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umin_i8_unaligned_release:
 ; -O1:    and w9, w1, #0xff
@@ -7728,14 +7262,12 @@ define dso_local i8 @atomicrmw_umin_i8_unaligned_release(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umin_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umin_i8_unaligned_acq_rel:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, ls
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umin_i8_unaligned_acq_rel:
 ; -O1:    and w9, w1, #0xff
@@ -7749,14 +7281,12 @@ define dso_local i8 @atomicrmw_umin_i8_unaligned_acq_rel(ptr %ptr, i8 %value) {
 
 define dso_local i8 @atomicrmw_umin_i8_unaligned_seq_cst(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_umin_i8_unaligned_seq_cst:
-; -O0:    and w9, w10, #0xff
+; -O0:    and w9, w0, #0xff
 ; -O0:    subs w9, w9, w8, uxtb
-; -O0:    csel w12, w10, w8, ls
-; -O0:    ldaxrb w9, [x11]
-; -O0:    cmp w9, w10, uxtb
-; -O0:    stlxrb w8, w12, [x11]
-; -O0:    and w8, w9, #0xff
-; -O0:    subs w8, w8, w10, uxtb
+; -O0:    csel w1, w0, w8, ls
+; -O0:    bl __aarch64_cas1_acq_rel
+; -O0:    and w8, w0, #0xff
+; -O0:    subs w8, w8, w9, uxtb
 ;
 ; -O1-LABEL: atomicrmw_umin_i8_unaligned_seq_cst:
 ; -O1:    and w9, w1, #0xff
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-outline_atomics.ll
index 403e4770e17f9c..86c040cc359359 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-outline_atomics.ll
@@ -4,2400 +4,1440 @@
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+outline-atomics -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 
 define dso_local i8 @cmpxchg_i8_aligned_monotonic_monotonic(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_monotonic_monotonic:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_monotonic_monotonic:
-; -O1:    bl __aarch64_cas1_relax
+; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_monotonic:
+; CHECK:    bl __aarch64_cas1_relax
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_monotonic_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_monotonic_monotonic_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_monotonic_monotonic_weak:
-; -O1:    bl __aarch64_cas1_relax
+; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_monotonic_weak:
+; CHECK:    bl __aarch64_cas1_relax
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_monotonic_acquire(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_monotonic_acquire:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_monotonic_acquire:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_acquire:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_monotonic_acquire_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_monotonic_acquire_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_monotonic_acquire_weak:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_acquire_weak:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_monotonic_seq_cst(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_monotonic_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_monotonic_seq_cst_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acquire_monotonic(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acquire_monotonic:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acquire_monotonic:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_aligned_acquire_monotonic:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acquire_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acquire_monotonic_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acquire_monotonic_weak:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_aligned_acquire_monotonic_weak:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acquire_acquire(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acquire_acquire:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acquire_acquire:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_aligned_acquire_acquire:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acquire_acquire_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acquire_acquire_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acquire_acquire_weak:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_aligned_acquire_acquire_weak:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acquire_seq_cst(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acquire_seq_cst:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acquire_seq_cst:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_acquire_seq_cst:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acquire_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acquire_seq_cst_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acquire_seq_cst_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_acquire_seq_cst_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_release_monotonic(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_release_monotonic:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_release_monotonic:
-; -O1:    bl __aarch64_cas1_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_release_monotonic:
+; CHECK:    bl __aarch64_cas1_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_release_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_release_monotonic_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_release_monotonic_weak:
-; -O1:    bl __aarch64_cas1_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_release_monotonic_weak:
+; CHECK:    bl __aarch64_cas1_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_release_acquire(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_release_acquire:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_release_acquire:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_release_acquire:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_release_acquire_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_release_acquire_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_release_acquire_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_release_acquire_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_release_seq_cst(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_release_seq_cst:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_release_seq_cst:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_release_seq_cst:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_release_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_release_seq_cst_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_release_seq_cst_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_release_seq_cst_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acq_rel_monotonic(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acq_rel_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_monotonic_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acq_rel_acquire(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_acquire:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_acquire:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_acquire:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acq_rel_acquire_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_acquire_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_acquire_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_acquire_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acq_rel_seq_cst(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_acq_rel_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_acq_rel_seq_cst_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_seq_cst_monotonic(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_seq_cst_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_monotonic_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_seq_cst_acquire(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_acquire:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_acquire:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_acquire:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_seq_cst_acquire_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_acquire_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_acquire_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_acquire_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_seq_cst_seq_cst(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_aligned_seq_cst_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_aligned_seq_cst_seq_cst_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_monotonic_monotonic(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_monotonic_monotonic:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_monotonic_monotonic:
-; -O1:    bl __aarch64_cas2_relax
+; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_monotonic:
+; CHECK:    bl __aarch64_cas2_relax
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new monotonic monotonic, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_monotonic_monotonic_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_monotonic_monotonic_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_monotonic_monotonic_weak:
-; -O1:    bl __aarch64_cas2_relax
+; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_monotonic_weak:
+; CHECK:    bl __aarch64_cas2_relax
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new monotonic monotonic, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_monotonic_acquire(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_monotonic_acquire:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_monotonic_acquire:
-; -O1:    bl __aarch64_cas2_acq
+; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_acquire:
+; CHECK:    bl __aarch64_cas2_acq
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new monotonic acquire, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_monotonic_acquire_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_monotonic_acquire_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_monotonic_acquire_weak:
-; -O1:    bl __aarch64_cas2_acq
+; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_acquire_weak:
+; CHECK:    bl __aarch64_cas2_acq
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new monotonic acquire, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_monotonic_seq_cst(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new monotonic seq_cst, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_monotonic_seq_cst_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst_weak:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_monotonic_seq_cst_weak:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new monotonic seq_cst, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acquire_monotonic(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acquire_monotonic:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acquire_monotonic:
-; -O1:    bl __aarch64_cas2_acq
+; CHECK-LABEL: cmpxchg_i16_aligned_acquire_monotonic:
+; CHECK:    bl __aarch64_cas2_acq
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acquire monotonic, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acquire_monotonic_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acquire_monotonic_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acquire_monotonic_weak:
-; -O1:    bl __aarch64_cas2_acq
+; CHECK-LABEL: cmpxchg_i16_aligned_acquire_monotonic_weak:
+; CHECK:    bl __aarch64_cas2_acq
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acquire monotonic, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acquire_acquire(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acquire_acquire:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acquire_acquire:
-; -O1:    bl __aarch64_cas2_acq
+; CHECK-LABEL: cmpxchg_i16_aligned_acquire_acquire:
+; CHECK:    bl __aarch64_cas2_acq
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acquire acquire, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acquire_acquire_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acquire_acquire_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acquire_acquire_weak:
-; -O1:    bl __aarch64_cas2_acq
+; CHECK-LABEL: cmpxchg_i16_aligned_acquire_acquire_weak:
+; CHECK:    bl __aarch64_cas2_acq
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acquire acquire, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acquire_seq_cst(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acquire_seq_cst:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acquire_seq_cst:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_acquire_seq_cst:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acquire seq_cst, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acquire_seq_cst_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acquire_seq_cst_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acquire_seq_cst_weak:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_acquire_seq_cst_weak:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acquire seq_cst, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_release_monotonic(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_release_monotonic:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_release_monotonic:
-; -O1:    bl __aarch64_cas2_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_release_monotonic:
+; CHECK:    bl __aarch64_cas2_rel
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new release monotonic, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_release_monotonic_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_release_monotonic_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_release_monotonic_weak:
-; -O1:    bl __aarch64_cas2_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_release_monotonic_weak:
+; CHECK:    bl __aarch64_cas2_rel
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new release monotonic, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_release_acquire(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_release_acquire:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_release_acquire:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_release_acquire:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new release acquire, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_release_acquire_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_release_acquire_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_release_acquire_weak:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_release_acquire_weak:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new release acquire, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_release_seq_cst(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_release_seq_cst:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_release_seq_cst:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_release_seq_cst:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new release seq_cst, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_release_seq_cst_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_release_seq_cst_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_release_seq_cst_weak:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_release_seq_cst_weak:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new release seq_cst, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acq_rel_monotonic(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acq_rel monotonic, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acq_rel_monotonic_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic_weak:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_monotonic_weak:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acq_rel monotonic, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acq_rel_acquire(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acq_rel_acquire:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_acquire:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_acquire:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acq_rel acquire, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acq_rel_acquire_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acq_rel_acquire_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_acquire_weak:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_acquire_weak:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acq_rel acquire, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acq_rel_seq_cst(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new acq_rel seq_cst, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_acq_rel_seq_cst_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst_weak:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_acq_rel_seq_cst_weak:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new acq_rel seq_cst, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_seq_cst_monotonic(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new seq_cst monotonic, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_seq_cst_monotonic_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic_weak:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_monotonic_weak:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new seq_cst monotonic, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_seq_cst_acquire(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_acquire:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_acquire:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_acquire:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new seq_cst acquire, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_seq_cst_acquire_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_acquire_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_acquire_weak:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_acquire_weak:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new seq_cst acquire, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_seq_cst_seq_cst(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg ptr %ptr, i16 %expected, i16 %new seq_cst seq_cst, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i16 @cmpxchg_i16_aligned_seq_cst_seq_cst_weak(i16 %expected, i16 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst_weak:
-; -O0:    ldaxrh w0, [x2]
-; -O0:    cmp w0, w9, uxth
-; -O0:    stlxrh w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst_weak:
-; -O1:    bl __aarch64_cas2_acq_rel
+; CHECK-LABEL: cmpxchg_i16_aligned_seq_cst_seq_cst_weak:
+; CHECK:    bl __aarch64_cas2_acq_rel
     %pair = cmpxchg weak ptr %ptr, i16 %expected, i16 %new seq_cst seq_cst, align 2
     %r = extractvalue { i16, i1 } %pair, 0
     ret i16 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_monotonic_monotonic(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_monotonic_monotonic:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_monotonic_monotonic:
-; -O1:    bl __aarch64_cas4_relax
+; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_monotonic:
+; CHECK:    bl __aarch64_cas4_relax
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new monotonic monotonic, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_monotonic_monotonic_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_monotonic_monotonic_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_monotonic_monotonic_weak:
-; -O1:    bl __aarch64_cas4_relax
+; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_monotonic_weak:
+; CHECK:    bl __aarch64_cas4_relax
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new monotonic monotonic, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_monotonic_acquire(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_monotonic_acquire:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_monotonic_acquire:
-; -O1:    bl __aarch64_cas4_acq
+; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_acquire:
+; CHECK:    bl __aarch64_cas4_acq
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new monotonic acquire, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_monotonic_acquire_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_monotonic_acquire_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_monotonic_acquire_weak:
-; -O1:    bl __aarch64_cas4_acq
+; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_acquire_weak:
+; CHECK:    bl __aarch64_cas4_acq
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new monotonic acquire, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_monotonic_seq_cst(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new monotonic seq_cst, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_monotonic_seq_cst_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst_weak:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_monotonic_seq_cst_weak:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new monotonic seq_cst, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acquire_monotonic(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acquire_monotonic:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acquire_monotonic:
-; -O1:    bl __aarch64_cas4_acq
+; CHECK-LABEL: cmpxchg_i32_aligned_acquire_monotonic:
+; CHECK:    bl __aarch64_cas4_acq
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acquire monotonic, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acquire_monotonic_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acquire_monotonic_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acquire_monotonic_weak:
-; -O1:    bl __aarch64_cas4_acq
+; CHECK-LABEL: cmpxchg_i32_aligned_acquire_monotonic_weak:
+; CHECK:    bl __aarch64_cas4_acq
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acquire monotonic, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acquire_acquire(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acquire_acquire:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acquire_acquire:
-; -O1:    bl __aarch64_cas4_acq
+; CHECK-LABEL: cmpxchg_i32_aligned_acquire_acquire:
+; CHECK:    bl __aarch64_cas4_acq
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acquire acquire, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acquire_acquire_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acquire_acquire_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acquire_acquire_weak:
-; -O1:    bl __aarch64_cas4_acq
+; CHECK-LABEL: cmpxchg_i32_aligned_acquire_acquire_weak:
+; CHECK:    bl __aarch64_cas4_acq
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acquire acquire, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acquire_seq_cst(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acquire_seq_cst:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acquire_seq_cst:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_acquire_seq_cst:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acquire seq_cst, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acquire_seq_cst_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acquire_seq_cst_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acquire_seq_cst_weak:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_acquire_seq_cst_weak:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acquire seq_cst, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_release_monotonic(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_release_monotonic:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_release_monotonic:
-; -O1:    bl __aarch64_cas4_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_release_monotonic:
+; CHECK:    bl __aarch64_cas4_rel
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new release monotonic, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_release_monotonic_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_release_monotonic_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_release_monotonic_weak:
-; -O1:    bl __aarch64_cas4_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_release_monotonic_weak:
+; CHECK:    bl __aarch64_cas4_rel
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new release monotonic, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_release_acquire(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_release_acquire:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_release_acquire:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_release_acquire:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new release acquire, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_release_acquire_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_release_acquire_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_release_acquire_weak:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_release_acquire_weak:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new release acquire, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_release_seq_cst(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_release_seq_cst:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_release_seq_cst:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_release_seq_cst:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new release seq_cst, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_release_seq_cst_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_release_seq_cst_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_release_seq_cst_weak:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_release_seq_cst_weak:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new release seq_cst, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acq_rel_monotonic(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acq_rel monotonic, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acq_rel_monotonic_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic_weak:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_monotonic_weak:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acq_rel monotonic, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acq_rel_acquire(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_acquire:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_acquire:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_acquire:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acq_rel acquire, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acq_rel_acquire_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_acquire_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_acquire_weak:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_acquire_weak:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acq_rel acquire, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acq_rel_seq_cst(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new acq_rel seq_cst, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_acq_rel_seq_cst_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst_weak:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_acq_rel_seq_cst_weak:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new acq_rel seq_cst, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_seq_cst_monotonic(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new seq_cst monotonic, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_seq_cst_monotonic_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic_weak:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_monotonic_weak:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new seq_cst monotonic, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_seq_cst_acquire(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_acquire:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_acquire:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_acquire:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new seq_cst acquire, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_seq_cst_acquire_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_acquire_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_acquire_weak:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_acquire_weak:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new seq_cst acquire, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_seq_cst_seq_cst(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg ptr %ptr, i32 %expected, i32 %new seq_cst seq_cst, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i32 @cmpxchg_i32_aligned_seq_cst_seq_cst_weak(i32 %expected, i32 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst_weak:
-; -O0:    ldaxr w0, [x2]
-; -O0:    cmp w0, w9
-; -O0:    stlxr w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst_weak:
-; -O1:    bl __aarch64_cas4_acq_rel
+; CHECK-LABEL: cmpxchg_i32_aligned_seq_cst_seq_cst_weak:
+; CHECK:    bl __aarch64_cas4_acq_rel
     %pair = cmpxchg weak ptr %ptr, i32 %expected, i32 %new seq_cst seq_cst, align 4
     %r = extractvalue { i32, i1 } %pair, 0
     ret i32 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_monotonic_monotonic(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_monotonic_monotonic:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_monotonic_monotonic:
-; -O1:    bl __aarch64_cas8_relax
+; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_monotonic:
+; CHECK:    bl __aarch64_cas8_relax
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new monotonic monotonic, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_monotonic_monotonic_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_monotonic_monotonic_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_monotonic_monotonic_weak:
-; -O1:    bl __aarch64_cas8_relax
+; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_monotonic_weak:
+; CHECK:    bl __aarch64_cas8_relax
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new monotonic monotonic, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_monotonic_acquire(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_monotonic_acquire:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_monotonic_acquire:
-; -O1:    bl __aarch64_cas8_acq
+; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_acquire:
+; CHECK:    bl __aarch64_cas8_acq
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new monotonic acquire, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_monotonic_acquire_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_monotonic_acquire_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_monotonic_acquire_weak:
-; -O1:    bl __aarch64_cas8_acq
+; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_acquire_weak:
+; CHECK:    bl __aarch64_cas8_acq
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new monotonic acquire, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_monotonic_seq_cst(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new monotonic seq_cst, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_monotonic_seq_cst_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst_weak:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_monotonic_seq_cst_weak:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new monotonic seq_cst, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acquire_monotonic(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acquire_monotonic:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acquire_monotonic:
-; -O1:    bl __aarch64_cas8_acq
+; CHECK-LABEL: cmpxchg_i64_aligned_acquire_monotonic:
+; CHECK:    bl __aarch64_cas8_acq
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acquire monotonic, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acquire_monotonic_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acquire_monotonic_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acquire_monotonic_weak:
-; -O1:    bl __aarch64_cas8_acq
+; CHECK-LABEL: cmpxchg_i64_aligned_acquire_monotonic_weak:
+; CHECK:    bl __aarch64_cas8_acq
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acquire monotonic, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acquire_acquire(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acquire_acquire:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acquire_acquire:
-; -O1:    bl __aarch64_cas8_acq
+; CHECK-LABEL: cmpxchg_i64_aligned_acquire_acquire:
+; CHECK:    bl __aarch64_cas8_acq
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acquire acquire, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acquire_acquire_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acquire_acquire_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acquire_acquire_weak:
-; -O1:    bl __aarch64_cas8_acq
+; CHECK-LABEL: cmpxchg_i64_aligned_acquire_acquire_weak:
+; CHECK:    bl __aarch64_cas8_acq
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acquire acquire, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acquire_seq_cst(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acquire_seq_cst:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acquire_seq_cst:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_acquire_seq_cst:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acquire seq_cst, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acquire_seq_cst_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acquire_seq_cst_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acquire_seq_cst_weak:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_acquire_seq_cst_weak:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acquire seq_cst, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_release_monotonic(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_release_monotonic:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_release_monotonic:
-; -O1:    bl __aarch64_cas8_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_release_monotonic:
+; CHECK:    bl __aarch64_cas8_rel
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new release monotonic, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_release_monotonic_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_release_monotonic_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_release_monotonic_weak:
-; -O1:    bl __aarch64_cas8_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_release_monotonic_weak:
+; CHECK:    bl __aarch64_cas8_rel
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new release monotonic, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_release_acquire(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_release_acquire:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_release_acquire:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_release_acquire:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new release acquire, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_release_acquire_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_release_acquire_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_release_acquire_weak:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_release_acquire_weak:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new release acquire, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_release_seq_cst(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_release_seq_cst:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_release_seq_cst:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_release_seq_cst:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new release seq_cst, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_release_seq_cst_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_release_seq_cst_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_release_seq_cst_weak:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_release_seq_cst_weak:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new release seq_cst, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acq_rel_monotonic(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acq_rel monotonic, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acq_rel_monotonic_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic_weak:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_monotonic_weak:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acq_rel monotonic, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acq_rel_acquire(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_acquire:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_acquire:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_acquire:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acq_rel acquire, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acq_rel_acquire_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_acquire_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_acquire_weak:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_acquire_weak:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acq_rel acquire, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acq_rel_seq_cst(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new acq_rel seq_cst, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_acq_rel_seq_cst_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst_weak:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_acq_rel_seq_cst_weak:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new acq_rel seq_cst, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_seq_cst_monotonic(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new seq_cst monotonic, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_seq_cst_monotonic_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic_weak:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_monotonic_weak:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new seq_cst monotonic, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_seq_cst_acquire(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_acquire:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_acquire:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_acquire:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new seq_cst acquire, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_seq_cst_acquire_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_acquire_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_acquire_weak:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_acquire_weak:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new seq_cst acquire, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_seq_cst_seq_cst(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg ptr %ptr, i64 %expected, i64 %new seq_cst seq_cst, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i64 @cmpxchg_i64_aligned_seq_cst_seq_cst_weak(i64 %expected, i64 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst_weak:
-; -O0:    ldaxr x0, [x2]
-; -O0:    cmp x0, x9
-; -O0:    stlxr w8, x1, [x2]
-;
-; -O1-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst_weak:
-; -O1:    bl __aarch64_cas8_acq_rel
+; CHECK-LABEL: cmpxchg_i64_aligned_seq_cst_seq_cst_weak:
+; CHECK:    bl __aarch64_cas8_acq_rel
     %pair = cmpxchg weak ptr %ptr, i64 %expected, i64 %new seq_cst seq_cst, align 8
     %r = extractvalue { i64, i1 } %pair, 0
     ret i64 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_monotonic_monotonic(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_monotonic_monotonic:
-; -O0:    ldxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x2, x3, [x4]
-; -O0:    stxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_monotonic_monotonic:
-; -O1:    bl __aarch64_cas16_relax
+; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_monotonic:
+; CHECK:    bl __aarch64_cas16_relax
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new monotonic monotonic, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_monotonic_monotonic_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_monotonic_monotonic_weak:
-; -O0:    ldxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x2, x3, [x4]
-; -O0:    stxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_monotonic_monotonic_weak:
-; -O1:    bl __aarch64_cas16_relax
+; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_monotonic_weak:
+; CHECK:    bl __aarch64_cas16_relax
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new monotonic monotonic, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_monotonic_acquire(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_monotonic_acquire:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x2, x3, [x4]
-; -O0:    stxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_monotonic_acquire:
-; -O1:    bl __aarch64_cas16_acq
+; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_acquire:
+; CHECK:    bl __aarch64_cas16_acq
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new monotonic acquire, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_monotonic_acquire_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_monotonic_acquire_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x2, x3, [x4]
-; -O0:    stxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_monotonic_acquire_weak:
-; -O1:    bl __aarch64_cas16_acq
+; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_acquire_weak:
+; CHECK:    bl __aarch64_cas16_acq
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new monotonic acquire, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_monotonic_seq_cst(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new monotonic seq_cst, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_monotonic_seq_cst_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst_weak:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_monotonic_seq_cst_weak:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new monotonic seq_cst, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acquire_monotonic(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acquire_monotonic:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x2, x3, [x4]
-; -O0:    stxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acquire_monotonic:
-; -O1:    bl __aarch64_cas16_acq
+; CHECK-LABEL: cmpxchg_i128_aligned_acquire_monotonic:
+; CHECK:    bl __aarch64_cas16_acq
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acquire monotonic, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acquire_monotonic_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acquire_monotonic_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x2, x3, [x4]
-; -O0:    stxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acquire_monotonic_weak:
-; -O1:    bl __aarch64_cas16_acq
+; CHECK-LABEL: cmpxchg_i128_aligned_acquire_monotonic_weak:
+; CHECK:    bl __aarch64_cas16_acq
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acquire monotonic, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acquire_acquire(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acquire_acquire:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x2, x3, [x4]
-; -O0:    stxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acquire_acquire:
-; -O1:    bl __aarch64_cas16_acq
+; CHECK-LABEL: cmpxchg_i128_aligned_acquire_acquire:
+; CHECK:    bl __aarch64_cas16_acq
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acquire acquire, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acquire_acquire_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acquire_acquire_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x2, x3, [x4]
-; -O0:    stxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acquire_acquire_weak:
-; -O1:    bl __aarch64_cas16_acq
+; CHECK-LABEL: cmpxchg_i128_aligned_acquire_acquire_weak:
+; CHECK:    bl __aarch64_cas16_acq
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acquire acquire, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acquire_seq_cst(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acquire_seq_cst:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acquire_seq_cst:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_acquire_seq_cst:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acquire seq_cst, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acquire_seq_cst_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acquire_seq_cst_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acquire_seq_cst_weak:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_acquire_seq_cst_weak:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acquire seq_cst, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_release_monotonic(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_release_monotonic:
-; -O0:    ldxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_release_monotonic:
-; -O1:    bl __aarch64_cas16_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_release_monotonic:
+; CHECK:    bl __aarch64_cas16_rel
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new release monotonic, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_release_monotonic_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_release_monotonic_weak:
-; -O0:    ldxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_release_monotonic_weak:
-; -O1:    bl __aarch64_cas16_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_release_monotonic_weak:
+; CHECK:    bl __aarch64_cas16_rel
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new release monotonic, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_release_acquire(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_release_acquire:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_release_acquire:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_release_acquire:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new release acquire, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_release_acquire_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_release_acquire_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_release_acquire_weak:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_release_acquire_weak:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new release acquire, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_release_seq_cst(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_release_seq_cst:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_release_seq_cst:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_release_seq_cst:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new release seq_cst, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_release_seq_cst_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_release_seq_cst_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_release_seq_cst_weak:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_release_seq_cst_weak:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new release seq_cst, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acq_rel_monotonic(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acq_rel monotonic, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acq_rel_monotonic_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic_weak:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_monotonic_weak:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acq_rel monotonic, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acq_rel_acquire(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_acquire:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_acquire:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_acquire:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acq_rel acquire, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acq_rel_acquire_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_acquire_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_acquire_weak:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_acquire_weak:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acq_rel acquire, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acq_rel_seq_cst(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new acq_rel seq_cst, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_acq_rel_seq_cst_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst_weak:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_acq_rel_seq_cst_weak:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new acq_rel seq_cst, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_seq_cst_monotonic(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new seq_cst monotonic, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_seq_cst_monotonic_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic_weak:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_monotonic_weak:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new seq_cst monotonic, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_seq_cst_acquire(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_acquire:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_acquire:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_acquire:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new seq_cst acquire, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_seq_cst_acquire_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_acquire_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_acquire_weak:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_acquire_weak:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new seq_cst acquire, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_seq_cst_seq_cst(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg ptr %ptr, i128 %expected, i128 %new seq_cst seq_cst, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i128 @cmpxchg_i128_aligned_seq_cst_seq_cst_weak(i128 %expected, i128 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst_weak:
-; -O0:    ldaxp x0, x1, [x4]
-; -O0:    cmp x0, x9
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x2, x3, [x4]
-; -O0:    stlxp w8, x0, x1, [x4]
-;
-; -O1-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst_weak:
-; -O1:    bl __aarch64_cas16_acq_rel
+; CHECK-LABEL: cmpxchg_i128_aligned_seq_cst_seq_cst_weak:
+; CHECK:    bl __aarch64_cas16_acq_rel
     %pair = cmpxchg weak ptr %ptr, i128 %expected, i128 %new seq_cst seq_cst, align 16
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_monotonic_monotonic(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic:
-; -O1:    bl __aarch64_cas1_relax
+; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic:
+; CHECK:    bl __aarch64_cas1_relax
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_monotonic_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic_weak:
-; -O1:    bl __aarch64_cas1_relax
+; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_monotonic_weak:
+; CHECK:    bl __aarch64_cas1_relax
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_monotonic_acquire(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_acquire:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_acquire:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_acquire:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_monotonic_acquire_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_acquire_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_acquire_weak:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_acquire_weak:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_monotonic_seq_cst(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_monotonic_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_monotonic_seq_cst_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new monotonic seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acquire_monotonic(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acquire_monotonic:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acquire_monotonic:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_monotonic:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acquire_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acquire_monotonic_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acquire_monotonic_weak:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_monotonic_weak:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acquire_acquire(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acquire_acquire:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acquire_acquire:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_acquire:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acquire_acquire_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acquire_acquire_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acquire_acquire_weak:
-; -O1:    bl __aarch64_cas1_acq
+; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_acquire_weak:
+; CHECK:    bl __aarch64_cas1_acq
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acquire_seq_cst(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acquire seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acquire_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_acquire_seq_cst_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acquire seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_release_monotonic(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_release_monotonic:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_release_monotonic:
-; -O1:    bl __aarch64_cas1_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_release_monotonic:
+; CHECK:    bl __aarch64_cas1_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_release_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_release_monotonic_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_release_monotonic_weak:
-; -O1:    bl __aarch64_cas1_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_release_monotonic_weak:
+; CHECK:    bl __aarch64_cas1_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_release_acquire(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_release_acquire:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_release_acquire:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_release_acquire:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_release_acquire_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_release_acquire_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_release_acquire_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_release_acquire_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_release_seq_cst(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_release_seq_cst:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_release_seq_cst:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_release_seq_cst:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new release seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_release_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_release_seq_cst_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_release_seq_cst_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_release_seq_cst_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new release seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_monotonic(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_monotonic_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_acquire(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_acquire_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_acquire_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_seq_cst(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new acq_rel seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_acq_rel_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_acq_rel_seq_cst_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new acq_rel seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_monotonic(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_monotonic_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_monotonic_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst monotonic, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_acquire(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_acquire_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_acquire_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst acquire, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_seq_cst(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new seq_cst seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
 }
 
 define dso_local i8 @cmpxchg_i8_unaligned_seq_cst_seq_cst_weak(i8 %expected, i8 %new, ptr %ptr) {
-; -O0-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst_weak:
-; -O0:    ldaxrb w0, [x2]
-; -O0:    cmp w0, w9, uxtb
-; -O0:    stlxrb w8, w1, [x2]
-;
-; -O1-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst_weak:
-; -O1:    bl __aarch64_cas1_acq_rel
+; CHECK-LABEL: cmpxchg_i8_unaligned_seq_cst_seq_cst_weak:
+; CHECK:    bl __aarch64_cas1_acq_rel
     %pair = cmpxchg weak ptr %ptr, i8 %expected, i8 %new seq_cst seq_cst, align 1
     %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r
@@ -3362,3 +2402,6 @@ define dso_local i128 @cmpxchg_i128_unaligned_seq_cst_seq_cst_weak(i128 %expecte
     %r = extractvalue { i128, i1 } %pair, 0
     ret i128 %r
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; -O0: {{.*}}
+; -O1: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
index a3d8531f5c7659..1fe63c9be8c629 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O1
+; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mattr=+outline-atomics -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-OUTLINE-LLSC-O1
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1
+; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mattr=+lse,+outline-atomics -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O0
+; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mattr=+outline-atomics -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-OUTLINE-LLSC-O0
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0
+; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mattr=+lse,+outline-atomics -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0
 @var = global i128 0
 
 define void @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
@@ -28,6 +32,25 @@ define void @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
 ; CHECK-LLSC-O1-NEXT:    str q0, [x0]
 ; CHECK-LLSC-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap:
+; CHECK-OUTLINE-LLSC-O1:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O1-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_offset w19, -8
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_offset w30, -16
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x19, x0
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x0, x2
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x1, x3
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x2, x4
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x3, x5
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x4, x19
+; CHECK-OUTLINE-LLSC-O1-NEXT:    bl __aarch64_cas16_acq
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[0], x0
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[1], x1
+; CHECK-OUTLINE-LLSC-O1-NEXT:    str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ret
+;
 ; CHECK-CAS-O1-LABEL: val_compare_and_swap:
 ; CHECK-CAS-O1:       // %bb.0:
 ; CHECK-CAS-O1-NEXT:    // kill: def $x2 killed $x2 killed $x2_x3 def $x2_x3
@@ -63,6 +86,29 @@ define void @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
 ; CHECK-LLSC-O0-NEXT:    str q0, [x0]
 ; CHECK-LLSC-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O0-LABEL: val_compare_and_swap:
+; CHECK-OUTLINE-LLSC-O0:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_offset w30, -16
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x0, [sp, #8] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x0, x2
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x1, x3
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x2, x4
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x4, [sp, #8] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x3, x5
+; CHECK-OUTLINE-LLSC-O0-NEXT:    bl __aarch64_cas16_acq
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x8, x0
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x0, [sp, #8] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    // implicit-def: $q0
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov v0.d[0], x8
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov v0.d[1], x1
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str q0, [x0]
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ret
+;
 ; CHECK-CAS-O0-LABEL: val_compare_and_swap:
 ; CHECK-CAS-O0:       // %bb.0:
 ; CHECK-CAS-O0-NEXT:    sub sp, sp, #16
@@ -113,6 +159,25 @@ define void @val_compare_and_swap_monotonic_seqcst(ptr %p, i128 %oldval, i128 %n
 ; CHECK-LLSC-O1-NEXT:    str q0, [x0]
 ; CHECK-LLSC-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_monotonic_seqcst:
+; CHECK-OUTLINE-LLSC-O1:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O1-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_offset w19, -8
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_offset w30, -16
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x19, x0
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x0, x2
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x1, x3
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x2, x4
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x3, x5
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x4, x19
+; CHECK-OUTLINE-LLSC-O1-NEXT:    bl __aarch64_cas16_acq_rel
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[0], x0
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[1], x1
+; CHECK-OUTLINE-LLSC-O1-NEXT:    str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ret
+;
 ; CHECK-CAS-O1-LABEL: val_compare_and_swap_monotonic_seqcst:
 ; CHECK-CAS-O1:       // %bb.0:
 ; CHECK-CAS-O1-NEXT:    // kill: def $x2 killed $x2 killed $x2_x3 def $x2_x3
@@ -148,6 +213,29 @@ define void @val_compare_and_swap_monotonic_seqcst(ptr %p, i128 %oldval, i128 %n
 ; CHECK-LLSC-O0-NEXT:    str q0, [x0]
 ; CHECK-LLSC-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O0-LABEL: val_compare_and_swap_monotonic_seqcst:
+; CHECK-OUTLINE-LLSC-O0:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_offset w30, -16
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x0, [sp, #8] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x0, x2
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x1, x3
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x2, x4
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x4, [sp, #8] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x3, x5
+; CHECK-OUTLINE-LLSC-O0-NEXT:    bl __aarch64_cas16_acq_rel
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x8, x0
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x0, [sp, #8] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    // implicit-def: $q0
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov v0.d[0], x8
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov v0.d[1], x1
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str q0, [x0]
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ret
+;
 ; CHECK-CAS-O0-LABEL: val_compare_and_swap_monotonic_seqcst:
 ; CHECK-CAS-O0:       // %bb.0:
 ; CHECK-CAS-O0-NEXT:    sub sp, sp, #16
@@ -198,6 +286,25 @@ define void @val_compare_and_swap_release_acquire(ptr %p, i128 %oldval, i128 %ne
 ; CHECK-LLSC-O1-NEXT:    str q0, [x0]
 ; CHECK-LLSC-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_release_acquire:
+; CHECK-OUTLINE-LLSC-O1:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O1-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_offset w19, -8
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_offset w30, -16
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x19, x0
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x0, x2
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x1, x3
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x2, x4
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x3, x5
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x4, x19
+; CHECK-OUTLINE-LLSC-O1-NEXT:    bl __aarch64_cas16_acq_rel
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[0], x0
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[1], x1
+; CHECK-OUTLINE-LLSC-O1-NEXT:    str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ret
+;
 ; CHECK-CAS-O1-LABEL: val_compare_and_swap_release_acquire:
 ; CHECK-CAS-O1:       // %bb.0:
 ; CHECK-CAS-O1-NEXT:    // kill: def $x2 killed $x2 killed $x2_x3 def $x2_x3
@@ -233,6 +340,29 @@ define void @val_compare_and_swap_release_acquire(ptr %p, i128 %oldval, i128 %ne
 ; CHECK-LLSC-O0-NEXT:    str q0, [x0]
 ; CHECK-LLSC-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O0-LABEL: val_compare_and_swap_release_acquire:
+; CHECK-OUTLINE-LLSC-O0:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_offset w30, -16
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x0, [sp, #8] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x0, x2
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x1, x3
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x2, x4
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x4, [sp, #8] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x3, x5
+; CHECK-OUTLINE-LLSC-O0-NEXT:    bl __aarch64_cas16_acq_rel
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x8, x0
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x0, [sp, #8] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    // implicit-def: $q0
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov v0.d[0], x8
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov v0.d[1], x1
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str q0, [x0]
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ret
+;
 ; CHECK-CAS-O0-LABEL: val_compare_and_swap_release_acquire:
 ; CHECK-CAS-O0:       // %bb.0:
 ; CHECK-CAS-O0-NEXT:    sub sp, sp, #16
@@ -283,6 +413,25 @@ define void @val_compare_and_swap_monotonic(ptr %p, i128 %oldval, i128 %newval)
 ; CHECK-LLSC-O1-NEXT:    str q0, [x0]
 ; CHECK-LLSC-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_monotonic:
+; CHECK-OUTLINE-LLSC-O1:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O1-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_offset w19, -8
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_offset w30, -16
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x19, x0
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x0, x2
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x1, x3
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x2, x4
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x3, x5
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x4, x19
+; CHECK-OUTLINE-LLSC-O1-NEXT:    bl __aarch64_cas16_acq_rel
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[0], x0
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[1], x1
+; CHECK-OUTLINE-LLSC-O1-NEXT:    str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ret
+;
 ; CHECK-CAS-O1-LABEL: val_compare_and_swap_monotonic:
 ; CHECK-CAS-O1:       // %bb.0:
 ; CHECK-CAS-O1-NEXT:    // kill: def $x2 killed $x2 killed $x2_x3 def $x2_x3
@@ -318,6 +467,29 @@ define void @val_compare_and_swap_monotonic(ptr %p, i128 %oldval, i128 %newval)
 ; CHECK-LLSC-O0-NEXT:    str q0, [x0]
 ; CHECK-LLSC-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O0-LABEL: val_compare_and_swap_monotonic:
+; CHECK-OUTLINE-LLSC-O0:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_offset w30, -16
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x0, [sp, #8] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x0, x2
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x1, x3
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x2, x4
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x4, [sp, #8] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x3, x5
+; CHECK-OUTLINE-LLSC-O0-NEXT:    bl __aarch64_cas16_acq_rel
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x8, x0
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x0, [sp, #8] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    // implicit-def: $q0
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov v0.d[0], x8
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov v0.d[1], x1
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str q0, [x0]
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ret
+;
 ; CHECK-CAS-O0-LABEL: val_compare_and_swap_monotonic:
 ; CHECK-CAS-O0:       // %bb.0:
 ; CHECK-CAS-O0-NEXT:    sub sp, sp, #16
@@ -358,6 +530,19 @@ define void @atomic_load_relaxed(i64, i64, ptr %p, ptr %p2) {
 ; CHECK-LLSC-O1-NEXT:    str q0, [x3]
 ; CHECK-LLSC-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O1-LABEL: atomic_load_relaxed:
+; CHECK-OUTLINE-LLSC-O1:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O1-NEXT:  .LBB4_1: // %atomicrmw.start
+; CHECK-OUTLINE-LLSC-O1-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ldxp x9, x8, [x2]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    stxp w10, x9, x8, [x2]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    cbnz w10, .LBB4_1
+; CHECK-OUTLINE-LLSC-O1-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[0], x9
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[1], x8
+; CHECK-OUTLINE-LLSC-O1-NEXT:    str q0, [x3]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ret
+;
 ; CHECK-CAS-O1-LABEL: atomic_load_relaxed:
 ; CHECK-CAS-O1:       // %bb.0:
 ; CHECK-CAS-O1-NEXT:    mov x0, xzr
@@ -392,6 +577,28 @@ define void @atomic_load_relaxed(i64, i64, ptr %p, ptr %p2) {
 ; CHECK-LLSC-O0-NEXT:    str q0, [x3]
 ; CHECK-LLSC-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O0-LABEL: atomic_load_relaxed:
+; CHECK-OUTLINE-LLSC-O0:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_offset w30, -16
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x4, x2
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x3, [sp, #8] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x3, xzr
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x0, x3
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x1, x3
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x2, x3
+; CHECK-OUTLINE-LLSC-O0-NEXT:    bl __aarch64_cas16_relax
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x3, [sp, #8] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    // implicit-def: $q0
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov v0.d[0], x0
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov v0.d[1], x1
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str q0, [x3]
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ret
+;
 ; CHECK-CAS-O0-LABEL: atomic_load_relaxed:
 ; CHECK-CAS-O0:       // %bb.0:
 ; CHECK-CAS-O0-NEXT:    mov x8, xzr
@@ -434,6 +641,21 @@ define i128 @val_compare_and_swap_return(ptr %p, i128 %oldval, i128 %newval) {
 ; CHECK-LLSC-O1-NEXT:    mov x0, x8
 ; CHECK-LLSC-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_return:
+; CHECK-OUTLINE-LLSC-O1:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O1-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-LLSC-O1-NEXT:    .cfi_offset w30, -16
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x6, x0
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x0, x2
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x1, x3
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x2, x4
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x3, x5
+; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x4, x6
+; CHECK-OUTLINE-LLSC-O1-NEXT:    bl __aarch64_cas16_acq
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O1-NEXT:    ret
+;
 ; CHECK-CAS-O1-LABEL: val_compare_and_swap_return:
 ; CHECK-CAS-O1:       // %bb.0:
 ; CHECK-CAS-O1-NEXT:    // kill: def $x2 killed $x2 killed $x2_x3 def $x2_x3
@@ -465,6 +687,23 @@ define i128 @val_compare_and_swap_return(ptr %p, i128 %oldval, i128 %newval) {
 ; CHECK-LLSC-O0-NEXT:  .LBB5_4:
 ; CHECK-LLSC-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-LLSC-O0-LABEL: val_compare_and_swap_return:
+; CHECK-OUTLINE-LLSC-O0:       // %bb.0:
+; CHECK-OUTLINE-LLSC-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    .cfi_offset w30, -16
+; CHECK-OUTLINE-LLSC-O0-NEXT:    str x0, [sp, #8] // 8-byte Folded Spill
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x0, x2
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x1, x3
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x2, x4
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x4, [sp, #8] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    mov x3, x5
+; CHECK-OUTLINE-LLSC-O0-NEXT:    bl __aarch64_cas16_acq
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-OUTLINE-LLSC-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-LLSC-O0-NEXT:    ret
+;
 ; CHECK-CAS-O0-LABEL: val_compare_and_swap_return:
 ; CHECK-CAS-O0:       // %bb.0:
 ; CHECK-CAS-O0-NEXT:    mov x8, x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
index d03647f8b294ef..dd516e4a1e6c74 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-NOLSE,CHECK-NOLSE-O1
+; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+outline-atomics -global-isel -global-isel-abort=1 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-OUTLINE,CHECK-OUTLINE-O1
 ; RUN: llc < %s -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -O0 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-NOLSE,CHECK-NOLSE-O0
+; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+outline-atomics -global-isel -global-isel-abort=1 -O0 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-OUTLINE,CHECK-OUTLINE-O0
 ; RUN: llc < %s -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -mcpu=apple-a13 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-LSE-O1
+; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+outline-atomics -global-isel -global-isel-abort=1 -mcpu=apple-a13 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-LSE-O1
 ; RUN: llc < %s -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -mcpu=apple-a13 -O0 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-LSE-O0
+; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+outline-atomics -global-isel -global-isel-abort=1 -mcpu=apple-a13 -O0 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK-LSE-O0
 
 define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) #0 {
 ; CHECK-NOLSE-O1-LABEL: val_compare_and_swap:
@@ -24,6 +28,17 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) #0 {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov w1, w2
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas4_acq
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    mov x9, x0
@@ -37,6 +52,19 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) #0 {
 ; CHECK-NOLSE-O0-NEXT:  LBB0_3:
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    mov w1, w2
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas4_acq
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: val_compare_and_swap:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    casa w1, w2, [x0]
@@ -75,6 +103,18 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) #0 {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_from_load:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    ldr w8, [x2]
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
+; CHECK-OUTLINE-O1-NEXT:    mov w1, w8
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas4_acq
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap_from_load:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    mov x9, x0
@@ -89,6 +129,20 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) #0 {
 ; CHECK-NOLSE-O0-NEXT:  LBB1_3:
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap_from_load:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    mov x8, x2
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w1, [x8]
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas4_acq
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: val_compare_and_swap_from_load:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldr w8, [x2]
@@ -129,6 +183,17 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) #0 {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_rel:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov w1, w2
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas4_acq_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap_rel:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    mov x9, x0
@@ -142,6 +207,19 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) #0 {
 ; CHECK-NOLSE-O0-NEXT:  LBB2_3:
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap_rel:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    mov w1, w2
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas4_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: val_compare_and_swap_rel:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    casal w1, w2, [x0]
@@ -179,6 +257,17 @@ define i64 @val_compare_and_swap_64(ptr %p, i64 %cmp, i64 %new) #0 {
 ; CHECK-NOLSE-O1-NEXT:    mov x0, x8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas8_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap_64:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    mov x9, x0
@@ -192,6 +281,19 @@ define i64 @val_compare_and_swap_64(ptr %p, i64 %cmp, i64 %new) #0 {
 ; CHECK-NOLSE-O0-NEXT:  LBB3_3:
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap_64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O0-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas8_relax
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: val_compare_and_swap_64:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    cas x1, x2, [x0]
@@ -229,6 +331,17 @@ define i64 @val_compare_and_swap_64_monotonic_seqcst(ptr %p, i64 %cmp, i64 %new)
 ; CHECK-NOLSE-O1-NEXT:    mov x0, x8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_64_monotonic_seqcst:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas8_acq_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap_64_monotonic_seqcst:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    mov x9, x0
@@ -242,6 +355,19 @@ define i64 @val_compare_and_swap_64_monotonic_seqcst(ptr %p, i64 %cmp, i64 %new)
 ; CHECK-NOLSE-O0-NEXT:  LBB4_3:
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap_64_monotonic_seqcst:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O0-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas8_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: val_compare_and_swap_64_monotonic_seqcst:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    casal x1, x2, [x0]
@@ -279,6 +405,17 @@ define i64 @val_compare_and_swap_64_release_acquire(ptr %p, i64 %cmp, i64 %new)
 ; CHECK-NOLSE-O1-NEXT:    mov x0, x8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_64_release_acquire:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas8_acq_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: val_compare_and_swap_64_release_acquire:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    mov x9, x0
@@ -292,6 +429,19 @@ define i64 @val_compare_and_swap_64_release_acquire(ptr %p, i64 %cmp, i64 %new)
 ; CHECK-NOLSE-O0-NEXT:  LBB5_3:
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: val_compare_and_swap_64_release_acquire:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O0-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas8_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: val_compare_and_swap_64_release_acquire:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    casal x1, x2, [x0]
@@ -323,6 +473,19 @@ define i32 @fetch_and_nand(ptr %p) #0 {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: fetch_and_nand:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB6_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldxr w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    and w9, w8, #0x7
+; CHECK-OUTLINE-O1-NEXT:    mvn w9, w9
+; CHECK-OUTLINE-O1-NEXT:    stlxr w10, w9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB6_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: fetch_and_nand:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -360,6 +523,35 @@ define i32 @fetch_and_nand(ptr %p) #0 {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: fetch_and_nand:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB6_1
+; CHECK-OUTLINE-O0-NEXT:  LBB6_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    and w8, w0, #0x7
+; CHECK-OUTLINE-O0-NEXT:    mvn w1, w8
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas4_rel
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w0, w8
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB6_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB6_2
+; CHECK-OUTLINE-O0-NEXT:  LBB6_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: fetch_and_nand:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    mov x8, x0
@@ -418,6 +610,19 @@ define i64 @fetch_and_nand_64(ptr %p) #0 {
 ; CHECK-NOLSE-O1-NEXT:    mov x0, x8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: fetch_and_nand_64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB7_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldaxr x8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    and x9, x8, #0x7
+; CHECK-OUTLINE-O1-NEXT:    mvn x9, x9
+; CHECK-OUTLINE-O1-NEXT:    stlxr w10, x9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB7_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: fetch_and_nand_64:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -455,6 +660,35 @@ define i64 @fetch_and_nand_64(ptr %p) #0 {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: fetch_and_nand_64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #24] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB7_1
+; CHECK-OUTLINE-O0-NEXT:  LBB7_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [sp, #24] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    and x8, x0, #0x7
+; CHECK-OUTLINE-O0-NEXT:    mvn x1, x8
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas8_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldr x8, [sp] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs x8, x0, x8
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #24] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB7_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB7_2
+; CHECK-OUTLINE-O0-NEXT:  LBB7_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: fetch_and_nand_64:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    mov x8, x0
@@ -513,6 +747,15 @@ define i32 @fetch_and_or(ptr %p) #0 {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: fetch_and_or:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-NEXT:    mov x1, x0
+; CHECK-OUTLINE-NEXT:    mov w0, #5 ; =0x5
+; CHECK-OUTLINE-NEXT:    bl ___aarch64_ldset4_acq_rel
+; CHECK-OUTLINE-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: fetch_and_or:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -578,6 +821,15 @@ define i64 @fetch_and_or_64(ptr %p) #0 {
 ; CHECK-NOLSE-O1-NEXT:    mov x0, x8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: fetch_and_or_64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, #7 ; =0x7
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldset8_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: fetch_and_or_64:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -614,6 +866,16 @@ define i64 @fetch_and_or_64(ptr %p) #0 {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: fetch_and_or_64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x1, x0
+; CHECK-OUTLINE-O0-NEXT:    mov w8, #7 ; =0x7
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldset8_relax
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: fetch_and_or_64:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    mov w8, #7 ; =0x7
@@ -636,6 +898,11 @@ define void @acquire_fence() #0 {
 ; CHECK-NOLSE-NEXT:    dmb ishld
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: acquire_fence:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    dmb ishld
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: acquire_fence:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    dmb ishld
@@ -655,6 +922,11 @@ define void @release_fence() #0 {
 ; CHECK-NOLSE-NEXT:    dmb ish
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: release_fence:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    dmb ish
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: release_fence:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    dmb ish
@@ -674,6 +946,11 @@ define void @seq_cst_fence() #0 {
 ; CHECK-NOLSE-NEXT:    dmb ish
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: seq_cst_fence:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    dmb ish
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: seq_cst_fence:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    dmb ish
@@ -693,6 +970,11 @@ define i32 @atomic_load(ptr %p) #0 {
 ; CHECK-NOLSE-NEXT:    ldar w0, [x0]
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: atomic_load:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    ldar w0, [x0]
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomic_load:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldar w0, [x0]
@@ -719,6 +1001,18 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-NEXT:    add w0, w8, w9
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    ldrb w8, [x0, #4095]
+; CHECK-OUTLINE-O1-NEXT:    ldrb w9, [x0, w1, sxtw]
+; CHECK-OUTLINE-O1-NEXT:    add x11, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O1-NEXT:    ldurb w10, [x0, #-256]
+; CHECK-OUTLINE-O1-NEXT:    add w8, w8, w9
+; CHECK-OUTLINE-O1-NEXT:    ldrb w9, [x11]
+; CHECK-OUTLINE-O1-NEXT:    add w8, w8, w10
+; CHECK-OUTLINE-O1-NEXT:    add w0, w8, w9
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    ldrb w9, [x0, #4095]
@@ -733,6 +1027,20 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O0-NEXT:    add w0, w8, w9, uxtb
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomic_load_relaxed_8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    ldrb w9, [x0, #4095]
+; CHECK-OUTLINE-O0-NEXT:    add x8, x0, w1, sxtw
+; CHECK-OUTLINE-O0-NEXT:    ldrb w8, [x8]
+; CHECK-OUTLINE-O0-NEXT:    add w8, w8, w9, uxtb
+; CHECK-OUTLINE-O0-NEXT:    subs x9, x0, #256
+; CHECK-OUTLINE-O0-NEXT:    ldrb w9, [x9]
+; CHECK-OUTLINE-O0-NEXT:    add w8, w8, w9, uxtb
+; CHECK-OUTLINE-O0-NEXT:    add x9, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O0-NEXT:    ldrb w9, [x9]
+; CHECK-OUTLINE-O0-NEXT:    add w0, w8, w9, uxtb
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomic_load_relaxed_8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldrb w8, [x0, #4095]
@@ -789,6 +1097,18 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-NEXT:    add w0, w8, w9
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    ldrh w8, [x0, #8190]
+; CHECK-OUTLINE-O1-NEXT:    ldrh w9, [x0, w1, sxtw #1]
+; CHECK-OUTLINE-O1-NEXT:    add x11, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O1-NEXT:    ldurh w10, [x0, #-256]
+; CHECK-OUTLINE-O1-NEXT:    add w8, w8, w9
+; CHECK-OUTLINE-O1-NEXT:    ldrh w9, [x11]
+; CHECK-OUTLINE-O1-NEXT:    add w8, w8, w10
+; CHECK-OUTLINE-O1-NEXT:    add w0, w8, w9
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    ldrh w9, [x0, #8190]
@@ -803,6 +1123,20 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O0-NEXT:    add w0, w8, w9, uxth
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomic_load_relaxed_16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    ldrh w9, [x0, #8190]
+; CHECK-OUTLINE-O0-NEXT:    add x8, x0, w1, sxtw #1
+; CHECK-OUTLINE-O0-NEXT:    ldrh w8, [x8]
+; CHECK-OUTLINE-O0-NEXT:    add w8, w8, w9, uxth
+; CHECK-OUTLINE-O0-NEXT:    subs x9, x0, #256
+; CHECK-OUTLINE-O0-NEXT:    ldrh w9, [x9]
+; CHECK-OUTLINE-O0-NEXT:    add w8, w8, w9, uxth
+; CHECK-OUTLINE-O0-NEXT:    add x9, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O0-NEXT:    ldrh w9, [x9]
+; CHECK-OUTLINE-O0-NEXT:    add w0, w8, w9, uxth
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomic_load_relaxed_16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldrh w8, [x0, #8190]
@@ -859,6 +1193,18 @@ define i32 @atomic_load_relaxed_32(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-NEXT:    add w0, w8, w9
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    ldr w8, [x0, #16380]
+; CHECK-OUTLINE-O1-NEXT:    ldr w9, [x0, w1, sxtw #2]
+; CHECK-OUTLINE-O1-NEXT:    add x11, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O1-NEXT:    ldur w10, [x0, #-256]
+; CHECK-OUTLINE-O1-NEXT:    add w8, w8, w9
+; CHECK-OUTLINE-O1-NEXT:    ldr w9, [x11]
+; CHECK-OUTLINE-O1-NEXT:    add w8, w8, w10
+; CHECK-OUTLINE-O1-NEXT:    add w0, w8, w9
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_32:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    ldr w8, [x0, #16380]
@@ -871,6 +1217,18 @@ define i32 @atomic_load_relaxed_32(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O0-NEXT:    add w0, w8, w9
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomic_load_relaxed_32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [x0, #16380]
+; CHECK-OUTLINE-O0-NEXT:    ldr w9, [x0, w1, sxtw #2]
+; CHECK-OUTLINE-O0-NEXT:    add w8, w8, w9
+; CHECK-OUTLINE-O0-NEXT:    ldur w9, [x0, #-256]
+; CHECK-OUTLINE-O0-NEXT:    add w8, w8, w9
+; CHECK-OUTLINE-O0-NEXT:    add x9, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O0-NEXT:    ldr w9, [x9]
+; CHECK-OUTLINE-O0-NEXT:    add w0, w8, w9
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomic_load_relaxed_32:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldr w8, [x0, #16380]
@@ -925,6 +1283,18 @@ define i64 @atomic_load_relaxed_64(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-NEXT:    add x0, x8, x9
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomic_load_relaxed_64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    ldr x8, [x0, #32760]
+; CHECK-OUTLINE-O1-NEXT:    ldr x9, [x0, w1, sxtw #3]
+; CHECK-OUTLINE-O1-NEXT:    add x11, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O1-NEXT:    ldur x10, [x0, #-256]
+; CHECK-OUTLINE-O1-NEXT:    add x8, x8, x9
+; CHECK-OUTLINE-O1-NEXT:    ldr x9, [x11]
+; CHECK-OUTLINE-O1-NEXT:    add x8, x8, x10
+; CHECK-OUTLINE-O1-NEXT:    add x0, x8, x9
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_64:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    ldr x8, [x0, #32760]
@@ -937,6 +1307,18 @@ define i64 @atomic_load_relaxed_64(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O0-NEXT:    add x0, x8, x9
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomic_load_relaxed_64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    ldr x8, [x0, #32760]
+; CHECK-OUTLINE-O0-NEXT:    ldr x9, [x0, w1, sxtw #3]
+; CHECK-OUTLINE-O0-NEXT:    add x8, x8, x9
+; CHECK-OUTLINE-O0-NEXT:    ldur x9, [x0, #-256]
+; CHECK-OUTLINE-O0-NEXT:    add x8, x8, x9
+; CHECK-OUTLINE-O0-NEXT:    add x9, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O0-NEXT:    ldr x9, [x9]
+; CHECK-OUTLINE-O0-NEXT:    add x0, x8, x9
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomic_load_relaxed_64:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldr x8, [x0, #32760]
@@ -986,6 +1368,12 @@ define void @atomc_store(ptr %p) #0 {
 ; CHECK-NOLSE-NEXT:    stlr w8, [x0]
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: atomc_store:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    mov w8, #4 ; =0x4
+; CHECK-OUTLINE-NEXT:    stlr w8, [x0]
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomc_store:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    mov w8, #4 ; =0x4
@@ -1011,6 +1399,15 @@ define void @atomic_store_relaxed_8(ptr %p, i32 %off32, i8 %val) #0 {
 ; CHECK-NOLSE-O1-NEXT:    strb w2, [x8]
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomic_store_relaxed_8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O1-NEXT:    strb w2, [x0, #4095]
+; CHECK-OUTLINE-O1-NEXT:    strb w2, [x0, w1, sxtw]
+; CHECK-OUTLINE-O1-NEXT:    sturb w2, [x0, #-256]
+; CHECK-OUTLINE-O1-NEXT:    strb w2, [x8]
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomic_store_relaxed_8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    strb w2, [x0, #4095]
@@ -1020,6 +1417,15 @@ define void @atomic_store_relaxed_8(ptr %p, i32 %off32, i8 %val) #0 {
 ; CHECK-NOLSE-O0-NEXT:    strb w2, [x8]
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomic_store_relaxed_8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    strb w2, [x0, #4095]
+; CHECK-OUTLINE-O0-NEXT:    strb w2, [x0, w1, sxtw]
+; CHECK-OUTLINE-O0-NEXT:    sturb w2, [x0, #-256]
+; CHECK-OUTLINE-O0-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O0-NEXT:    strb w2, [x8]
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomic_store_relaxed_8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    strb w2, [x0, #4095]
@@ -1062,6 +1468,15 @@ define void @atomic_store_relaxed_16(ptr %p, i32 %off32, i16 %val) #0 {
 ; CHECK-NOLSE-O1-NEXT:    strh w2, [x8]
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomic_store_relaxed_16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O1-NEXT:    strh w2, [x0, #8190]
+; CHECK-OUTLINE-O1-NEXT:    strh w2, [x0, w1, sxtw #1]
+; CHECK-OUTLINE-O1-NEXT:    sturh w2, [x0, #-256]
+; CHECK-OUTLINE-O1-NEXT:    strh w2, [x8]
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomic_store_relaxed_16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    strh w2, [x0, #8190]
@@ -1071,6 +1486,15 @@ define void @atomic_store_relaxed_16(ptr %p, i32 %off32, i16 %val) #0 {
 ; CHECK-NOLSE-O0-NEXT:    strh w2, [x8]
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomic_store_relaxed_16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    strh w2, [x0, #8190]
+; CHECK-OUTLINE-O0-NEXT:    strh w2, [x0, w1, sxtw #1]
+; CHECK-OUTLINE-O0-NEXT:    sturh w2, [x0, #-256]
+; CHECK-OUTLINE-O0-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O0-NEXT:    strh w2, [x8]
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomic_store_relaxed_16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    strh w2, [x0, #8190]
@@ -1113,6 +1537,15 @@ define void @atomic_store_relaxed_32(ptr %p, i32 %off32, i32 %val) #0 {
 ; CHECK-NOLSE-O1-NEXT:    str w2, [x8]
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomic_store_relaxed_32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O1-NEXT:    str w2, [x0, #16380]
+; CHECK-OUTLINE-O1-NEXT:    str w2, [x0, w1, sxtw #2]
+; CHECK-OUTLINE-O1-NEXT:    stur w2, [x0, #-256]
+; CHECK-OUTLINE-O1-NEXT:    str w2, [x8]
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomic_store_relaxed_32:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    str w2, [x0, #16380]
@@ -1122,6 +1555,15 @@ define void @atomic_store_relaxed_32(ptr %p, i32 %off32, i32 %val) #0 {
 ; CHECK-NOLSE-O0-NEXT:    str w2, [x8]
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomic_store_relaxed_32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    str w2, [x0, #16380]
+; CHECK-OUTLINE-O0-NEXT:    str w2, [x0, w1, sxtw #2]
+; CHECK-OUTLINE-O0-NEXT:    stur w2, [x0, #-256]
+; CHECK-OUTLINE-O0-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O0-NEXT:    str w2, [x8]
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomic_store_relaxed_32:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    str w2, [x0, #16380]
@@ -1164,6 +1606,15 @@ define void @atomic_store_relaxed_64(ptr %p, i32 %off32, i64 %val) #0 {
 ; CHECK-NOLSE-O1-NEXT:    str x2, [x8]
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomic_store_relaxed_64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O1-NEXT:    str x2, [x0, #32760]
+; CHECK-OUTLINE-O1-NEXT:    str x2, [x0, w1, sxtw #3]
+; CHECK-OUTLINE-O1-NEXT:    stur x2, [x0, #-256]
+; CHECK-OUTLINE-O1-NEXT:    str x2, [x8]
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomic_store_relaxed_64:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    str x2, [x0, #32760]
@@ -1173,6 +1624,15 @@ define void @atomic_store_relaxed_64(ptr %p, i32 %off32, i64 %val) #0 {
 ; CHECK-NOLSE-O0-NEXT:    str x2, [x8]
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomic_store_relaxed_64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    str x2, [x0, #32760]
+; CHECK-OUTLINE-O0-NEXT:    str x2, [x0, w1, sxtw #3]
+; CHECK-OUTLINE-O0-NEXT:    stur x2, [x0, #-256]
+; CHECK-OUTLINE-O0-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-OUTLINE-O0-NEXT:    str x2, [x8]
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomic_store_relaxed_64:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    str x2, [x0, #32760]
@@ -1213,6 +1673,13 @@ define i32 @load_zext(ptr %p8, ptr %p16) {
 ; CHECK-NOLSE-O1-NEXT:    add w0, w9, w8, uxtb
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: load_zext:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    ldarb w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    ldrh w9, [x1]
+; CHECK-OUTLINE-O1-NEXT:    add w0, w9, w8, uxtb
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: load_zext:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    ldarb w9, [x0]
@@ -1220,6 +1687,13 @@ define i32 @load_zext(ptr %p8, ptr %p16) {
 ; CHECK-NOLSE-O0-NEXT:    add w0, w8, w9, uxtb
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: load_zext:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    ldarb w9, [x0]
+; CHECK-OUTLINE-O0-NEXT:    ldrh w8, [x1]
+; CHECK-OUTLINE-O0-NEXT:    add w0, w8, w9, uxtb
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: load_zext:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldaprb w8, [x0]
@@ -1250,6 +1724,12 @@ define { i32, i64 } @load_acq(ptr %p32, ptr %p64) {
 ; CHECK-NOLSE-NEXT:    ldar x1, [x1]
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: load_acq:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    ldar w0, [x0]
+; CHECK-OUTLINE-NEXT:    ldar x1, [x1]
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: load_acq:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldar w0, [x0]
@@ -1279,6 +1759,14 @@ define i32 @load_sext(ptr %p8, ptr %p16) {
 ; CHECK-NOLSE-O1-NEXT:    add w0, w9, w8, sxtb
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: load_sext:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    ldarb w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    ldrh w9, [x1]
+; CHECK-OUTLINE-O1-NEXT:    sxth w9, w9
+; CHECK-OUTLINE-O1-NEXT:    add w0, w9, w8, sxtb
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: load_sext:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    ldarb w9, [x0]
@@ -1287,6 +1775,14 @@ define i32 @load_sext(ptr %p8, ptr %p16) {
 ; CHECK-NOLSE-O0-NEXT:    add w0, w8, w9, sxtb
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: load_sext:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    ldarb w9, [x0]
+; CHECK-OUTLINE-O0-NEXT:    ldrh w8, [x1]
+; CHECK-OUTLINE-O0-NEXT:    sxth w8, w8
+; CHECK-OUTLINE-O0-NEXT:    add w0, w8, w9, sxtb
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: load_sext:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldaprb w8, [x0]
@@ -1319,6 +1815,12 @@ define void @store_trunc(i32 %val, ptr %p8, ptr %p16) {
 ; CHECK-NOLSE-NEXT:    strh w0, [x2]
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: store_trunc:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    stlrb w0, [x1]
+; CHECK-OUTLINE-NEXT:    strh w0, [x2]
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: store_trunc:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    stlrb w0, [x1]
@@ -1352,6 +1854,19 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_add_i8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldadd1_acq_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_add_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -1392,6 +1907,21 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_add_i8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldadd1_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_add_i8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldaddalb w1, w0, [x0]
@@ -1418,6 +1948,19 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_xchg_i8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_swp1_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_xchg_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -1457,6 +2000,21 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_xchg_i8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_swp1_relax
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_xchg_i8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    swpb w1, w0, [x0]
@@ -1483,6 +2041,19 @@ define i8 @atomicrmw_sub_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_sub_i8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    neg w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldadd1_acq
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_sub_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -1523,6 +2094,23 @@ define i8 @atomicrmw_sub_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_sub_i8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w9, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    mov w8, wzr
+; CHECK-OUTLINE-O0-NEXT:    subs w0, w8, w9
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldadd1_acq
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_sub_i8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    neg w8, w1
@@ -1551,6 +2139,20 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_and_i8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w8, #-1 ; =0xffffffff
+; CHECK-OUTLINE-O1-NEXT:    eor w0, w8, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldclr1_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_and_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -1591,6 +2193,23 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_and_i8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w9, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    mov w8, #-1 ; =0xffffffff
+; CHECK-OUTLINE-O0-NEXT:    eor w0, w8, w9
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldclr1_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_and_i8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    mvn w8, w1
@@ -1619,6 +2238,19 @@ define i8 @atomicrmw_or_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_or_i8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldset1_acq_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_or_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -1659,6 +2291,21 @@ define i8 @atomicrmw_or_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_or_i8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldset1_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_or_i8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldsetalb w1, w0, [x0]
@@ -1685,6 +2332,19 @@ define i8 @atomicrmw_xor_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_xor_i8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldeor1_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_xor_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -1725,6 +2385,21 @@ define i8 @atomicrmw_xor_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_xor_i8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldeor1_relax
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_xor_i8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldeorb w1, w0, [x0]
@@ -1753,6 +2428,20 @@ define i8 @atomicrmw_min_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_min_i8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB33_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldaxrb w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    sxtb w9, w8
+; CHECK-OUTLINE-O1-NEXT:    cmp w9, w1, sxtb
+; CHECK-OUTLINE-O1-NEXT:    csel w9, w8, w1, le
+; CHECK-OUTLINE-O1-NEXT:    stxrb w10, w9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB33_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_min_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -1795,6 +2484,42 @@ define i8 @atomicrmw_min_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_min_i8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldrb w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB33_1
+; CHECK-OUTLINE-O0-NEXT:  LBB33_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    sxtb w9, w0
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w9, w8, sxtb
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, le
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas1_acq
+; CHECK-OUTLINE-O0-NEXT:    ldr w9, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    and w8, w0, #0xff
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w8, w9, uxtb
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB33_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB33_2
+; CHECK-OUTLINE-O0-NEXT:  LBB33_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_min_i8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldsminab w1, w0, [x0]
@@ -1823,6 +2548,20 @@ define i8 @atomicrmw_max_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_max_i8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB34_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldxrb w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    sxtb w9, w8
+; CHECK-OUTLINE-O1-NEXT:    cmp w9, w1, sxtb
+; CHECK-OUTLINE-O1-NEXT:    csel w9, w8, w1, gt
+; CHECK-OUTLINE-O1-NEXT:    stlxrb w10, w9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB34_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_max_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -1865,6 +2604,42 @@ define i8 @atomicrmw_max_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_max_i8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldrb w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB34_1
+; CHECK-OUTLINE-O0-NEXT:  LBB34_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    sxtb w9, w0
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w9, w8, sxtb
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, gt
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas1_rel
+; CHECK-OUTLINE-O0-NEXT:    ldr w9, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    and w8, w0, #0xff
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w8, w9, uxtb
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB34_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB34_2
+; CHECK-OUTLINE-O0-NEXT:  LBB34_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_max_i8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldsmaxlb w1, w0, [x0]
@@ -1894,6 +2669,21 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_umin_i8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    and w9, w1, #0xff
+; CHECK-OUTLINE-O1-NEXT:  LBB35_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldaxrb w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xff
+; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, ls
+; CHECK-OUTLINE-O1-NEXT:    stlxrb w11, w10, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB35_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_umin_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -1936,6 +2726,42 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_umin_i8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldrb w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB35_1
+; CHECK-OUTLINE-O0-NEXT:  LBB35_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    and w9, w0, #0xff
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w9, w8, uxtb
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, ls
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas1_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldr w9, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    and w8, w0, #0xff
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w8, w9, uxtb
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB35_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB35_2
+; CHECK-OUTLINE-O0-NEXT:  LBB35_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_umin_i8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    lduminalb w1, w0, [x0]
@@ -1965,6 +2791,21 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_umax_i8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    and w9, w1, #0xff
+; CHECK-OUTLINE-O1-NEXT:  LBB36_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldxrb w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xff
+; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, hi
+; CHECK-OUTLINE-O1-NEXT:    stxrb w11, w10, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB36_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_umax_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -2007,6 +2848,42 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_umax_i8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldrb w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB36_1
+; CHECK-OUTLINE-O0-NEXT:  LBB36_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    and w9, w0, #0xff
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w9, w8, uxtb
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, hi
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas1_relax
+; CHECK-OUTLINE-O0-NEXT:    ldr w9, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    and w8, w0, #0xff
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w8, w9, uxtb
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB36_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB36_2
+; CHECK-OUTLINE-O0-NEXT:  LBB36_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_umax_i8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldumaxb w1, w0, [x0]
@@ -2033,6 +2910,19 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_add_i16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldadd2_acq_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_add_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -2073,6 +2963,21 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_add_i16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldadd2_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_add_i16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldaddalh w1, w0, [x0]
@@ -2099,6 +3004,19 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_xchg_i16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_swp2_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_xchg_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -2138,6 +3056,21 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_xchg_i16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_swp2_relax
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_xchg_i16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    swph w1, w0, [x0]
@@ -2164,6 +3097,19 @@ define i16 @atomicrmw_sub_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_sub_i16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    neg w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldadd2_acq
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_sub_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -2204,6 +3150,23 @@ define i16 @atomicrmw_sub_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_sub_i16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w9, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    mov w8, wzr
+; CHECK-OUTLINE-O0-NEXT:    subs w0, w8, w9
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldadd2_acq
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_sub_i16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    neg w8, w1
@@ -2232,6 +3195,20 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_and_i16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w8, #-1 ; =0xffffffff
+; CHECK-OUTLINE-O1-NEXT:    eor w0, w8, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldclr2_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_and_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -2272,6 +3249,23 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_and_i16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w9, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    mov w8, #-1 ; =0xffffffff
+; CHECK-OUTLINE-O0-NEXT:    eor w0, w8, w9
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldclr2_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_and_i16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    mvn w8, w1
@@ -2300,6 +3294,19 @@ define i16 @atomicrmw_or_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_or_i16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldset2_acq_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_or_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -2340,6 +3347,21 @@ define i16 @atomicrmw_or_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_or_i16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldset2_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_or_i16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldsetalh w1, w0, [x0]
@@ -2366,6 +3388,19 @@ define i16 @atomicrmw_xor_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_xor_i16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldeor2_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_xor_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -2406,6 +3441,21 @@ define i16 @atomicrmw_xor_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_xor_i16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldeor2_relax
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_xor_i16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldeorh w1, w0, [x0]
@@ -2434,6 +3484,20 @@ define i16 @atomicrmw_min_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_min_i16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB43_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldaxrh w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    sxth w9, w8
+; CHECK-OUTLINE-O1-NEXT:    cmp w9, w1, sxth
+; CHECK-OUTLINE-O1-NEXT:    csel w9, w8, w1, le
+; CHECK-OUTLINE-O1-NEXT:    stxrh w10, w9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB43_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_min_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -2476,6 +3540,42 @@ define i16 @atomicrmw_min_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_min_i16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldrh w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB43_1
+; CHECK-OUTLINE-O0-NEXT:  LBB43_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    sxth w9, w0
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w9, w8, sxth
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, le
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas2_acq
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    uxth w8, w8
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w8, w0, uxth
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB43_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB43_2
+; CHECK-OUTLINE-O0-NEXT:  LBB43_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_min_i16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldsminah w1, w0, [x0]
@@ -2504,6 +3604,20 @@ define i16 @atomicrmw_max_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_max_i16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB44_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldxrh w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    sxth w9, w8
+; CHECK-OUTLINE-O1-NEXT:    cmp w9, w1, sxth
+; CHECK-OUTLINE-O1-NEXT:    csel w9, w8, w1, gt
+; CHECK-OUTLINE-O1-NEXT:    stlxrh w10, w9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB44_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_max_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -2546,6 +3660,42 @@ define i16 @atomicrmw_max_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_max_i16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldrh w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB44_1
+; CHECK-OUTLINE-O0-NEXT:  LBB44_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    sxth w9, w0
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w9, w8, sxth
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, gt
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas2_rel
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    uxth w8, w8
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w8, w0, uxth
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB44_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB44_2
+; CHECK-OUTLINE-O0-NEXT:  LBB44_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_max_i16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldsmaxlh w1, w0, [x0]
@@ -2575,6 +3725,21 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_umin_i16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    and w9, w1, #0xffff
+; CHECK-OUTLINE-O1-NEXT:  LBB45_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldaxrh w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xffff
+; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, ls
+; CHECK-OUTLINE-O1-NEXT:    stlxrh w11, w10, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB45_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_umin_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -2617,6 +3782,42 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_umin_i16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldrh w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB45_1
+; CHECK-OUTLINE-O0-NEXT:  LBB45_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    uxth w9, w0
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w9, w8, uxth
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, ls
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas2_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    uxth w8, w8
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w8, w0, uxth
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB45_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB45_2
+; CHECK-OUTLINE-O0-NEXT:  LBB45_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_umin_i16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    lduminalh w1, w0, [x0]
@@ -2646,6 +3847,21 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:    mov w0, w8
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_umax_i16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    and w9, w1, #0xffff
+; CHECK-OUTLINE-O1-NEXT:  LBB46_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldxrh w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xffff
+; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, hi
+; CHECK-OUTLINE-O1-NEXT:    stxrh w11, w10, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB46_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: atomicrmw_umax_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
@@ -2688,6 +3904,42 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_umax_i16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldrh w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB46_1
+; CHECK-OUTLINE-O0-NEXT:  LBB46_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    uxth w9, w0
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w9, w8, uxth
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, hi
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas2_relax
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    uxth w8, w8
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w8, w0, uxth
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB46_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB46_2
+; CHECK-OUTLINE-O0-NEXT:  LBB46_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: atomicrmw_umax_i16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldumaxh w1, w0, [x0]
@@ -2726,6 +3978,28 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
 ; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: cmpxchg_i8:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
+; CHECK-OUTLINE-O1-NEXT:    mov w1, w2
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w19
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas1_relax
+; CHECK-OUTLINE-O1-NEXT:    and w8, w0, #0xff
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w19, uxtb
+; CHECK-OUTLINE-O1-NEXT:    cset w1, eq
+; CHECK-OUTLINE-O1-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: cmpxchg_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    mov x9, x0
@@ -2742,6 +4016,27 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
 ; CHECK-NOLSE-O0-NEXT:    cset w1, eq
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: cmpxchg_i8:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w1, w2
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas1_relax
+; CHECK-OUTLINE-O0-NEXT:    ldr w1, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    and w8, w0, #0xff
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w8, w1, uxtb
+; CHECK-OUTLINE-O0-NEXT:    cset w1, eq
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: cmpxchg_i8:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    mov x8, x1
@@ -2790,6 +4085,28 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
 ; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
+; CHECK-OUTLINE-O1-LABEL: cmpxchg_i16:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
+; CHECK-OUTLINE-O1-NEXT:    mov w1, w2
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w19
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas2_relax
+; CHECK-OUTLINE-O1-NEXT:    and w8, w0, #0xffff
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w19, uxth
+; CHECK-OUTLINE-O1-NEXT:    cset w1, eq
+; CHECK-OUTLINE-O1-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
 ; CHECK-NOLSE-O0-LABEL: cmpxchg_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    mov x9, x0
@@ -2806,6 +4123,27 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
 ; CHECK-NOLSE-O0-NEXT:    cset w1, eq
 ; CHECK-NOLSE-O0-NEXT:    ret
 ;
+; CHECK-OUTLINE-O0-LABEL: cmpxchg_i16:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w1, w2
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas2_relax
+; CHECK-OUTLINE-O0-NEXT:    ldr w1, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    and w8, w0, #0xffff
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w8, w1, uxth
+; CHECK-OUTLINE-O0-NEXT:    cset w1, eq
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: cmpxchg_i16:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    mov x8, x1
@@ -2836,6 +4174,12 @@ define internal double @bitcast_to_double(ptr %ptr) {
 ; CHECK-NOLSE-NEXT:    fmov d0, x8
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: bitcast_to_double:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    ldar x8, [x0]
+; CHECK-OUTLINE-NEXT:    fmov d0, x8
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: bitcast_to_double:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldar x8, [x0]
@@ -2859,6 +4203,12 @@ define internal float @bitcast_to_float(ptr %ptr) {
 ; CHECK-NOLSE-NEXT:    fmov s0, w8
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: bitcast_to_float:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    ldar w8, [x0]
+; CHECK-OUTLINE-NEXT:    fmov s0, w8
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: bitcast_to_float:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldar w8, [x0]
@@ -2883,6 +4233,13 @@ define internal half @bitcast_to_half(ptr %ptr) {
 ; CHECK-NOLSE-NEXT:    ; kill: def $h0 killed $h0 killed $s0
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: bitcast_to_half:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    ldarh w8, [x0]
+; CHECK-OUTLINE-NEXT:    fmov s0, w8
+; CHECK-OUTLINE-NEXT:    ; kill: def $h0 killed $h0 killed $s0
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: bitcast_to_half:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldarh w8, [x0]
@@ -2907,6 +4264,11 @@ define internal ptr @inttoptr(ptr %ptr) {
 ; CHECK-NOLSE-NEXT:    ldar x0, [x0]
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: inttoptr:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    ldar x0, [x0]
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: inttoptr:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldar x0, [x0]
@@ -2927,6 +4289,11 @@ define internal ptr @load_ptr(ptr %ptr) {
 ; CHECK-NOLSE-NEXT:    ldar x0, [x0]
 ; CHECK-NOLSE-NEXT:    ret
 ;
+; CHECK-OUTLINE-LABEL: load_ptr:
+; CHECK-OUTLINE:       ; %bb.0:
+; CHECK-OUTLINE-NEXT:    ldar x0, [x0]
+; CHECK-OUTLINE-NEXT:    ret
+;
 ; CHECK-LSE-O1-LABEL: load_ptr:
 ; CHECK-LSE-O1:       ; %bb.0:
 ; CHECK-LSE-O1-NEXT:    ldar x0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index ae15e74a43277a..5f412d20196c20 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -203,7 +203,6 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_ATOMICRMW_SUB (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_ATOMICRMW_AND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
@@ -226,7 +225,6 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_ATOMICRMW_MIN (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_ATOMICRMW_UMAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices

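The generated CHECK lines in the test updates above are easier to read against the calling convention of the outline-atomics helpers: the value operand(s) go in the low argument registers, the pointer comes last, and the previous memory value comes back in w0/x0, which is why the O1 code only shuffles registers before each bl. The declarations below are a hedged summary of that convention as it appears in the generated code, not declarations taken from the patch; the real symbols are provided by compiler-rt or libgcc.

  #include <cstdint>

  extern "C" {
  // Fetch-and-add / swap style helpers: value first, pointer second.
  uint32_t __aarch64_ldadd4_acq_rel(uint32_t val, uint32_t *ptr); // w0, x1
  uint32_t __aarch64_swp4_relax(uint32_t val, uint32_t *ptr);     // w0, x1
  // Compare-and-swap: expected value, new value, then the pointer.
  uint8_t __aarch64_cas1_relax(uint8_t expected, uint8_t desired,
                               uint8_t *ptr);                     // w0, w1, x2
  }
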
>From 454c46aa5566fe86b8e08e66836473220096f85e Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme at arm.com>
Date: Wed, 6 Dec 2023 13:32:27 +0000
Subject: [PATCH 2/8] Move the ATOMICRMW_SUB-to-ATOMICRMW_ADD conversion into
 libcall()

---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 21 +++++++++----------
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  9 +-------
 .../GlobalISel/legalizer-info-validation.mir  |  1 +
 3 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 186937e597c5bc..683b614192f457 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -824,6 +824,10 @@ getOutlineAtomicLibcall(unsigned Opc, AtomicOrdering Order, uint64_t MemSize) {
     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
     return LC[ModeN][ModelN];
   }
+  case TargetOpcode::G_ATOMICRMW_SUB: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
+    return LC[ModeN][ModelN];
+  }
   case TargetOpcode::G_ATOMICRMW_AND: {
     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
     return LC[ModeN][ModelN];
@@ -879,6 +883,7 @@ createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
   }
   case TargetOpcode::G_ATOMICRMW_XCHG:
   case TargetOpcode::G_ATOMICRMW_ADD:
+  case TargetOpcode::G_ATOMICRMW_SUB:
   case TargetOpcode::G_ATOMICRMW_AND:
   case TargetOpcode::G_ATOMICRMW_OR:
   case TargetOpcode::G_ATOMICRMW_XOR: {
@@ -889,6 +894,10 @@ createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
       Register Tmp = MRI.createGenericVirtualRegister(ValLLT);
       MIRBuilder.buildXor(Tmp, MIRBuilder.buildConstant(ValLLT, -1), Val);
       Val = Tmp;
+    } else if (Opc == TargetOpcode::G_ATOMICRMW_SUB) {
+      Register Tmp = MRI.createGenericVirtualRegister(ValLLT);
+      MIRBuilder.buildSub(Tmp, MIRBuilder.buildConstant(ValLLT, 0), Val);
+      Val = Tmp;
     }
     Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
     Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
@@ -1182,6 +1191,7 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
   }
   case TargetOpcode::G_ATOMICRMW_XCHG:
   case TargetOpcode::G_ATOMICRMW_ADD:
+  case TargetOpcode::G_ATOMICRMW_SUB:
   case TargetOpcode::G_ATOMICRMW_AND:
   case TargetOpcode::G_ATOMICRMW_OR:
   case TargetOpcode::G_ATOMICRMW_XOR:
@@ -3965,17 +3975,6 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     return lowerTRUNC(MI);
   GISEL_VECREDUCE_CASES_NONSEQ
     return lowerVectorReduction(MI);
-  case G_ATOMICRMW_SUB: {
-    auto Val = MI.getOperand(2).getReg();
-    LLT ValLLT = MRI.getType(Val);
-    Register Tmp = MRI.createGenericVirtualRegister(ValLLT);
-    MIRBuilder.buildSub(Tmp, MIRBuilder.buildConstant(ValLLT, 0), Val);
-    auto [Ret, Mem] = MI.getFirst2Regs();
-    auto &MMO = cast<GMemOperation>(MI).getMMO();
-    MIRBuilder.buildAtomicRMWAdd(Ret, Mem, Tmp, MMO);
-    MI.eraseFromParent();
-    return Legalized;
-  }
   }
 }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 7fce3e501db57c..cca06a2510c4f6 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -768,7 +768,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
 
   getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
-                               G_ATOMICRMW_AND, G_ATOMICRMW_OR,
+                               G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
                                G_ATOMICRMW_XOR})
       .libcallIf([&ST](const LegalityQuery &Query) {
         return ST.outlineAtomics() && !ST.hasLSE();
@@ -776,13 +776,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampScalar(0, s32, s64)
       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
 
-  getActionDefinitionsBuilder(G_ATOMICRMW_SUB)
-      .lowerIf([&ST](const LegalityQuery &Query) {
-        return ST.outlineAtomics() && !ST.hasLSE();
-      })
-      .clampScalar(0, s32, s64)
-      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
-
   // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
   // Don't outline them unless
   // (1) high level <atomic> support approved:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 5f412d20196c20..a0c13e3a82f774 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -203,6 +203,7 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_ATOMICRMW_SUB (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_ATOMICRMW_AND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices

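Patch 2 above makes G_ATOMICRMW_SUB take the same libcall path as G_ATOMICRMW_ADD by negating the value operand, relying on the identity that subtracting v is the same as adding its two's-complement negation. A minimal stand-alone sketch of that identity, assuming a hypothetical wrapper and the helper calling convention shown by the generated code (this is not code from the patch):

  #include <cstdint>

  extern "C" uint32_t __aarch64_ldadd4_relax(uint32_t val, uint32_t *ptr);

  // atomicrmw sub p, v behaves like atomicrmw add p, (0 - v), so the
  // existing LDADD outline helpers can serve subtraction as well.
  static inline uint32_t atomic_fetch_sub_relaxed(uint32_t *p, uint32_t v) {
    return __aarch64_ldadd4_relax(0u - v, p);
  }
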
>From 845d86ce320de79583107f5c312545ee771278c7 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme at arm.com>
Date: Thu, 7 Dec 2023 19:36:49 +0000
Subject: [PATCH 3/8] Put legal first, then custom

---
 .../llvm/CodeGen/GlobalISel/LegalizerInfo.h   |  5 ++++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    | 28 ++++++++++---------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index e51a3ec9400543..6a59cd8f10f877 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -223,6 +223,11 @@ struct TypePairAndMemDesc {
   }
 };
 
+/// True iff P is false.
+template <typename Predicate> Predicate predNot(Predicate P) {
+  return [=](const LegalityQuery &Query) { return !P(Query); };
+}
+
 /// True iff P0 and P1 are true.
 template<typename Predicate>
 Predicate all(Predicate P0, Predicate P1) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index cca06a2510c4f6..8a0ff402336e14 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -757,24 +757,26 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .lowerIf(
           all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
 
+  LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) {
+    return ST.outlineAtomics() && !ST.hasLSE();
+  };
+
   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
-      .libcallIf([&ST](const LegalityQuery &Query) {
-        return ST.outlineAtomics() && !ST.hasLSE();
-      })
-      .customIf([](const LegalityQuery &Query) {
-        return Query.Types[0].getSizeInBits() == 128;
-      })
-      .clampScalar(0, s32, s64)
-      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
+      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
+                   predNot(UseOutlineAtomics)))
+      .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics)))
+      .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0),
+                     UseOutlineAtomics))
+      .clampScalar(0, s32, s64);
 
   getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
                                G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
                                G_ATOMICRMW_XOR})
-      .libcallIf([&ST](const LegalityQuery &Query) {
-        return ST.outlineAtomics() && !ST.hasLSE();
-      })
-      .clampScalar(0, s32, s64)
-      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
+      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
+                   predNot(UseOutlineAtomics)))
+      .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
+                     UseOutlineAtomics))
+      .clampScalar(0, s32, s64);
 
   // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
   // Don't outline them unless

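The reordering in patch 3 matters because legalization rules are consulted in the order they were added and the first matching rule decides the action; with the legal and custom cases listed first, each of them now has to exclude the outline-atomics configuration explicitly via predNot(UseOutlineAtomics). A rough, generic sketch of that first-match dispatch, in plain C++ rather than LLVM's actual LegalizeRuleSet:

  #include <functional>
  #include <utility>
  #include <vector>

  enum class Action { Legal, Custom, Libcall, Unsupported };

  struct Query {
    unsigned SizeInBits;
    bool UseOutlineAtomics;
  };

  struct RuleSet {
    std::vector<std::pair<std::function<bool(const Query &)>, Action>> Rules;

    // The first rule whose predicate matches wins, so an earlier rule
    // shadows any later one for the queries it covers.
    Action decide(const Query &Q) const {
      for (const auto &[Pred, Act] : Rules)
        if (Pred(Q))
          return Act;
      return Action::Unsupported;
    }
  };
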
>From f8488aa6a11701492394a6d887edb0e4a5d87891 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme at arm.com>
Date: Fri, 15 Dec 2023 11:58:41 +0000
Subject: [PATCH 4/8] Share outline atomic libcall selection.

---
 llvm/include/llvm/CodeGen/RuntimeLibcalls.h   |  6 ++
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 70 +++++--------------
 llvm/lib/CodeGen/TargetLoweringBase.cpp       | 41 +++++++----
 3 files changed, 49 insertions(+), 68 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcalls.h b/llvm/include/llvm/CodeGen/RuntimeLibcalls.h
index 66642068151073..3a407c4a4d9406 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcalls.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcalls.h
@@ -82,6 +82,12 @@ namespace RTLIB {
   /// UNKNOWN_LIBCALL if there is none.
   Libcall getSYNC(unsigned Opc, MVT VT);
 
+  /// Return the outline atomics value for the given atomic ordering, access
+  /// size and set of libcalls for a given atomic, or UNKNOWN_LIBCALL if there
+  /// is none.
+  Libcall getOutlineAtomicHelper(const Libcall (&LC)[5][4],
+                                 AtomicOrdering Order, uint64_t MemSize);
+
   /// Return the outline atomics value for the given opcode, atomic ordering
   /// and type, or UNKNOWN_LIBCALL if there is none.
   Libcall getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 683b614192f457..ea5e212e6b21d5 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -25,6 +25,7 @@
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -765,46 +766,15 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
   return LegalizerHelper::Legalized;
 }
 
-static RTLIB::Libcall
-getOutlineAtomicLibcall(unsigned Opc, AtomicOrdering Order, uint64_t MemSize) {
-  unsigned ModeN, ModelN;
-  switch (MemSize) {
-  case 1:
-    ModeN = 0;
-    break;
-  case 2:
-    ModeN = 1;
-    break;
-  case 4:
-    ModeN = 2;
-    break;
-  case 8:
-    ModeN = 3;
-    break;
-  case 16:
-    ModeN = 4;
-    break;
-  default:
-    return RTLIB::UNKNOWN_LIBCALL;
-  }
-
-  switch (Order) {
-  case AtomicOrdering::Monotonic:
-    ModelN = 0;
-    break;
-  case AtomicOrdering::Acquire:
-    ModelN = 1;
-    break;
-  case AtomicOrdering::Release:
-    ModelN = 2;
-    break;
-  case AtomicOrdering::AcquireRelease:
-  case AtomicOrdering::SequentiallyConsistent:
-    ModelN = 3;
-    break;
-  default:
+static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  auto &AtomicMI = cast<GMemOperation>(MI);
+  auto &MMO = AtomicMI.getMMO();
+  auto Ordering = MMO.getMergedOrdering();
+  LLT MemType = MMO.getMemoryType();
+  uint64_t MemSize = MemType.getSizeInBytes();
+  if (!MemType.isScalar())
     return RTLIB::UNKNOWN_LIBCALL;
-  }
 
 #define LCALLS(A, B)                                                           \
   { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
@@ -814,31 +784,28 @@ getOutlineAtomicLibcall(unsigned Opc, AtomicOrdering Order, uint64_t MemSize) {
   case TargetOpcode::G_ATOMIC_CMPXCHG:
   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Ordering, MemSize);
   }
   case TargetOpcode::G_ATOMICRMW_XCHG: {
     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
-    return LC[ModeN][ModelN];
-  }
-  case TargetOpcode::G_ATOMICRMW_ADD: {
-    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Ordering, MemSize);
   }
+  case TargetOpcode::G_ATOMICRMW_ADD:
   case TargetOpcode::G_ATOMICRMW_SUB: {
     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Ordering, MemSize);
   }
   case TargetOpcode::G_ATOMICRMW_AND: {
     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Ordering, MemSize);
   }
   case TargetOpcode::G_ATOMICRMW_OR: {
     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Ordering, MemSize);
   }
   case TargetOpcode::G_ATOMICRMW_XOR: {
     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Ordering, MemSize);
   }
   default:
     return RTLIB::UNKNOWN_LIBCALL;
@@ -909,10 +876,7 @@ createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
 
   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
-  auto &AtomicMI = cast<GMemOperation>(MI);
-  auto Ordering = AtomicMI.getMMO().getMergedOrdering();
-  uint64_t MemSize = AtomicMI.getMemSize();
-  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(Opc, Ordering, MemSize);
+  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
   const char *Name = TLI.getLibcallName(RTLibcall);
 
   // Unsupported libcall on the target.
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 2648c16bcd8d90..acbbfd9ddaf52d 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -520,27 +520,28 @@ RTLIB::Libcall RTLIB::getFREXP(EVT RetVT) {
                       FREXP_PPCF128);
 }
 
-RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order,
-                                        MVT VT) {
+RTLIB::Libcall RTLIB::getOutlineAtomicHelper(const Libcall (&LC)[5][4],
+                                             AtomicOrdering Order,
+                                             uint64_t MemSize) {
   unsigned ModeN, ModelN;
-  switch (VT.SimpleTy) {
-  case MVT::i8:
+  switch (MemSize) {
+  case 1:
     ModeN = 0;
     break;
-  case MVT::i16:
+  case 2:
     ModeN = 1;
     break;
-  case MVT::i32:
+  case 4:
     ModeN = 2;
     break;
-  case MVT::i64:
+  case 8:
     ModeN = 3;
     break;
-  case MVT::i128:
+  case 16:
     ModeN = 4;
     break;
   default:
-    return UNKNOWN_LIBCALL;
+    return RTLIB::UNKNOWN_LIBCALL;
   }
 
   switch (Order) {
@@ -561,6 +562,16 @@ RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order,
     return UNKNOWN_LIBCALL;
   }
 
+  return LC[ModeN][ModelN];
+}
+
+RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order,
+                                        MVT VT) {
+  unsigned ModeN, ModelN;
+  if (!VT.isScalarInteger())
+    return UNKNOWN_LIBCALL;
+  uint64_t MemSize = VT.getScalarSizeInBits() / 8;
+
 #define LCALLS(A, B)                                                           \
   { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
 #define LCALL5(A)                                                              \
@@ -568,27 +579,27 @@ RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order,
   switch (Opc) {
   case ISD::ATOMIC_CMP_SWAP: {
     const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_CAS)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Order, MemSize);
   }
   case ISD::ATOMIC_SWAP: {
     const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_SWP)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Order, MemSize);
   }
   case ISD::ATOMIC_LOAD_ADD: {
     const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDADD)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Order, MemSize);
   }
   case ISD::ATOMIC_LOAD_OR: {
     const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDSET)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Order, MemSize);
   }
   case ISD::ATOMIC_LOAD_CLR: {
     const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDCLR)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Order, MemSize);
   }
   case ISD::ATOMIC_LOAD_XOR: {
     const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDEOR)};
-    return LC[ModeN][ModelN];
+    return getOutlineAtomicHelper(LC, Order, MemSize);
   }
   default:
     return UNKNOWN_LIBCALL;

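Patch 4 reduces the duplicated selection logic to a single table lookup: the row is picked by the access size (1, 2, 4, 8 or 16 bytes) and the column by the ordering bucket (relaxed, acquire, release, acq_rel/seq_cst). A small sketch that restates only the mapping visible in the diff; the function name and its shape are hypothetical:

  #include <cstdint>
  #include <optional>
  #include <utility>

  // Row/column that a (size, ordering-bucket) pair selects in the LC[5][4]
  // tables, or nothing where the helper would return UNKNOWN_LIBCALL.
  std::optional<std::pair<unsigned, unsigned>>
  outlineAtomicIndices(uint64_t MemSize, unsigned OrderingBucket) {
    unsigned ModeN;
    switch (MemSize) {
    case 1:  ModeN = 0; break;
    case 2:  ModeN = 1; break;
    case 4:  ModeN = 2; break;
    case 8:  ModeN = 3; break;
    case 16: ModeN = 4; break;
    default: return std::nullopt;
    }
    if (OrderingBucket > 3)
      return std::nullopt;
    // Example: a 4-byte acquire cmpxchg selects LC[2][1], which in the CAS
    // table is the entry naming __aarch64_cas4_acq.
    return std::make_pair(ModeN, OrderingBucket);
  }
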
>From 8bc9d0f5ba0f2653222d492a95f7c46f7407186d Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme at arm.com>
Date: Fri, 15 Dec 2023 13:50:59 +0000
Subject: [PATCH 5/8] Add 32- and 64-bit atomic tests for GlobalISel

---
 .../AArch64/GlobalISel/arm64-atomic.ll        | 2281 ++++++++++++++++-
 1 file changed, 2261 insertions(+), 20 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
index dd516e4a1e6c74..986c36426fb539 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -3953,26 +3953,2069 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) {
   ret i16 %res
 }
 
+define i32 @atomicrmw_add_i32(ptr %ptr, i32 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_add_i32:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB47_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldaxr w8, [x0]
+; CHECK-NOLSE-O1-NEXT:    add w9, w8, w1
+; CHECK-NOLSE-O1-NEXT:    stlxr w10, w9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB47_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov w0, w8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_add_i32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldadd4_acq_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_add_i32:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str w8, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB47_1
+; CHECK-NOLSE-O0-NEXT:  LBB47_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB47_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [sp, #28] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr w9, [sp, #24] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add w12, w8, w9
+; CHECK-NOLSE-O0-NEXT:  LBB47_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB47_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr w9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp w9, w8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB47_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB47_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, w12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB47_2
+; CHECK-NOLSE-O0-NEXT:  LBB47_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB47_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #12] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs w8, w9, w8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB47_1
+; CHECK-NOLSE-O0-NEXT:    b LBB47_5
+; CHECK-NOLSE-O0-NEXT:  LBB47_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_add_i32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldadd4_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_add_i32:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldaddal w1, w0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_add_i32:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldaddal w1, w0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw add ptr %ptr, i32 %rhs seq_cst
+  ret i32 %res
+}
+
+define i32 @atomicrmw_xchg_i32(ptr %ptr, i32 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_xchg_i32:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:    mov x8, x0
+; CHECK-NOLSE-O1-NEXT:  LBB48_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr w0, [x8]
+; CHECK-NOLSE-O1-NEXT:    stxr w9, w1, [x8]
+; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB48_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_xchg_i32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_swp4_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_xchg_i32:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str w8, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB48_1
+; CHECK-NOLSE-O0-NEXT:  LBB48_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB48_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [sp, #28] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr w12, [sp, #24] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:  LBB48_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB48_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr w9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp w9, w8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB48_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB48_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, w12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB48_2
+; CHECK-NOLSE-O0-NEXT:  LBB48_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB48_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #12] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs w8, w9, w8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB48_1
+; CHECK-NOLSE-O0-NEXT:    b LBB48_5
+; CHECK-NOLSE-O0-NEXT:  LBB48_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_xchg_i32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_swp4_relax
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_xchg_i32:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    swp w1, w0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_xchg_i32:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    swp w1, w0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw xchg ptr %ptr, i32 %rhs monotonic
+  ret i32 %res
+}
+
+define i32 @atomicrmw_sub_i32(ptr %ptr, i32 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_sub_i32:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB49_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldaxr w8, [x0]
+; CHECK-NOLSE-O1-NEXT:    sub w9, w8, w1
+; CHECK-NOLSE-O1-NEXT:    stxr w10, w9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB49_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov w0, w8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_sub_i32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    neg w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldadd4_acq
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_sub_i32:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str w8, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB49_1
+; CHECK-NOLSE-O0-NEXT:  LBB49_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB49_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [sp, #28] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr w9, [sp, #24] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    subs w12, w8, w9
+; CHECK-NOLSE-O0-NEXT:  LBB49_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB49_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr w9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp w9, w8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB49_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB49_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, w12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB49_2
+; CHECK-NOLSE-O0-NEXT:  LBB49_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB49_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #12] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs w8, w9, w8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB49_1
+; CHECK-NOLSE-O0-NEXT:    b LBB49_5
+; CHECK-NOLSE-O0-NEXT:  LBB49_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_sub_i32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w9, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    mov w8, wzr
+; CHECK-OUTLINE-O0-NEXT:    subs w0, w8, w9
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldadd4_acq
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_sub_i32:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    neg w8, w1
+; CHECK-LSE-O1-NEXT:    ldadda w8, w0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_sub_i32:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    neg w8, w1
+; CHECK-LSE-O0-NEXT:    ldadda w8, w0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw sub ptr %ptr, i32 %rhs acquire
+  ret i32 %res
+}
+
+define i32 @atomicrmw_and_i32(ptr %ptr, i32 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_and_i32:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB50_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr w8, [x0]
+; CHECK-NOLSE-O1-NEXT:    and w9, w8, w1
+; CHECK-NOLSE-O1-NEXT:    stlxr w10, w9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB50_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov w0, w8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_and_i32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w8, #-1 ; =0xffffffff
+; CHECK-OUTLINE-O1-NEXT:    eor w0, w8, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldclr4_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_and_i32:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str w8, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB50_1
+; CHECK-NOLSE-O0-NEXT:  LBB50_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB50_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [sp, #28] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr w9, [sp, #24] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    and w12, w8, w9
+; CHECK-NOLSE-O0-NEXT:  LBB50_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB50_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr w9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp w9, w8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB50_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB50_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, w12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB50_2
+; CHECK-NOLSE-O0-NEXT:  LBB50_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB50_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #12] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs w8, w9, w8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB50_1
+; CHECK-NOLSE-O0-NEXT:    b LBB50_5
+; CHECK-NOLSE-O0-NEXT:  LBB50_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_and_i32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w9, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    mov w8, #-1 ; =0xffffffff
+; CHECK-OUTLINE-O0-NEXT:    eor w0, w8, w9
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldclr4_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_and_i32:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    mvn w8, w1
+; CHECK-LSE-O1-NEXT:    ldclrl w8, w0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_and_i32:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    mvn w8, w1
+; CHECK-LSE-O0-NEXT:    ldclrl w8, w0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw and ptr %ptr, i32 %rhs release
+  ret i32 %res
+}
+
+define i32 @atomicrmw_or_i32(ptr %ptr, i32 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_or_i32:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB51_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldaxr w8, [x0]
+; CHECK-NOLSE-O1-NEXT:    orr w9, w8, w1
+; CHECK-NOLSE-O1-NEXT:    stlxr w10, w9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB51_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov w0, w8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_or_i32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldset4_acq_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_or_i32:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str w8, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB51_1
+; CHECK-NOLSE-O0-NEXT:  LBB51_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB51_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [sp, #28] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr w9, [sp, #24] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    orr w12, w8, w9
+; CHECK-NOLSE-O0-NEXT:  LBB51_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB51_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr w9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp w9, w8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB51_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB51_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, w12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB51_2
+; CHECK-NOLSE-O0-NEXT:  LBB51_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB51_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #12] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs w8, w9, w8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB51_1
+; CHECK-NOLSE-O0-NEXT:    b LBB51_5
+; CHECK-NOLSE-O0-NEXT:  LBB51_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_or_i32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldset4_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_or_i32:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldsetal w1, w0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_or_i32:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldsetal w1, w0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw or ptr %ptr, i32 %rhs seq_cst
+  ret i32 %res
+}
+
+define i32 @atomicrmw_xor_i32(ptr %ptr, i32 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_xor_i32:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB52_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr w8, [x0]
+; CHECK-NOLSE-O1-NEXT:    eor w9, w8, w1
+; CHECK-NOLSE-O1-NEXT:    stxr w10, w9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB52_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov w0, w8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_xor_i32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldeor4_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_xor_i32:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str w8, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB52_1
+; CHECK-NOLSE-O0-NEXT:  LBB52_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB52_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [sp, #28] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr w9, [sp, #24] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    eor w12, w8, w9
+; CHECK-NOLSE-O0-NEXT:  LBB52_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB52_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr w9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp w9, w8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB52_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB52_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, w12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB52_2
+; CHECK-NOLSE-O0-NEXT:  LBB52_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB52_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #12] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs w8, w9, w8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB52_1
+; CHECK-NOLSE-O0-NEXT:    b LBB52_5
+; CHECK-NOLSE-O0-NEXT:  LBB52_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_xor_i32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldeor4_relax
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_xor_i32:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldeor w1, w0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_xor_i32:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldeor w1, w0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw xor ptr %ptr, i32 %rhs monotonic
+  ret i32 %res
+}
+
+define i32 @atomicrmw_min_i32(ptr %ptr, i32 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_min_i32:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB53_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldaxr w8, [x0]
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w1
+; CHECK-NOLSE-O1-NEXT:    csel w9, w8, w1, le
+; CHECK-NOLSE-O1-NEXT:    stxr w10, w9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB53_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov w0, w8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_min_i32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB53_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldaxr w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w1
+; CHECK-OUTLINE-O1-NEXT:    csel w9, w8, w1, le
+; CHECK-OUTLINE-O1-NEXT:    stxr w10, w9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB53_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_min_i32:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str w8, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB53_1
+; CHECK-NOLSE-O0-NEXT:  LBB53_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB53_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [sp, #28] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr w9, [sp, #24] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    subs w10, w8, w9
+; CHECK-NOLSE-O0-NEXT:    csel w12, w8, w9, le
+; CHECK-NOLSE-O0-NEXT:  LBB53_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB53_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr w9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp w9, w8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB53_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB53_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, w12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB53_2
+; CHECK-NOLSE-O0-NEXT:  LBB53_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB53_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #12] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs w8, w9, w8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB53_1
+; CHECK-NOLSE-O0-NEXT:    b LBB53_5
+; CHECK-NOLSE-O0-NEXT:  LBB53_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_min_i32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB53_1
+; CHECK-OUTLINE-O0-NEXT:  LBB53_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w0, w8
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, le
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas4_acq
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w0, w8
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB53_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB53_2
+; CHECK-OUTLINE-O0-NEXT:  LBB53_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_min_i32:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldsmina w1, w0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_min_i32:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldsmina w1, w0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw min ptr %ptr, i32 %rhs acquire
+  ret i32 %res
+}
+
+define i32 @atomicrmw_max_i32(ptr %ptr, i32 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_max_i32:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB54_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr w8, [x0]
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w1
+; CHECK-NOLSE-O1-NEXT:    csel w9, w8, w1, gt
+; CHECK-NOLSE-O1-NEXT:    stlxr w10, w9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB54_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov w0, w8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_max_i32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB54_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldxr w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w1
+; CHECK-OUTLINE-O1-NEXT:    csel w9, w8, w1, gt
+; CHECK-OUTLINE-O1-NEXT:    stlxr w10, w9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB54_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_max_i32:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str w8, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB54_1
+; CHECK-NOLSE-O0-NEXT:  LBB54_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB54_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [sp, #28] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr w9, [sp, #24] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    subs w10, w8, w9
+; CHECK-NOLSE-O0-NEXT:    csel w12, w8, w9, gt
+; CHECK-NOLSE-O0-NEXT:  LBB54_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB54_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr w9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp w9, w8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB54_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB54_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, w12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB54_2
+; CHECK-NOLSE-O0-NEXT:  LBB54_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB54_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #12] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs w8, w9, w8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB54_1
+; CHECK-NOLSE-O0-NEXT:    b LBB54_5
+; CHECK-NOLSE-O0-NEXT:  LBB54_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_max_i32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB54_1
+; CHECK-OUTLINE-O0-NEXT:  LBB54_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w0, w8
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, gt
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas4_rel
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w0, w8
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB54_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB54_2
+; CHECK-OUTLINE-O0-NEXT:  LBB54_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_max_i32:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldsmaxl w1, w0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_max_i32:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldsmaxl w1, w0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw max ptr %ptr, i32 %rhs release
+  ret i32 %res
+}
+
+define i32 @atomicrmw_umin_i32(ptr %ptr, i32 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_umin_i32:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB55_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldaxr w8, [x0]
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w1
+; CHECK-NOLSE-O1-NEXT:    csel w9, w8, w1, ls
+; CHECK-NOLSE-O1-NEXT:    stlxr w10, w9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB55_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov w0, w8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_umin_i32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB55_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldaxr w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w1
+; CHECK-OUTLINE-O1-NEXT:    csel w9, w8, w1, ls
+; CHECK-OUTLINE-O1-NEXT:    stlxr w10, w9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB55_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_umin_i32:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str w8, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB55_1
+; CHECK-NOLSE-O0-NEXT:  LBB55_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB55_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [sp, #28] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr w9, [sp, #24] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    subs w10, w8, w9
+; CHECK-NOLSE-O0-NEXT:    csel w12, w8, w9, ls
+; CHECK-NOLSE-O0-NEXT:  LBB55_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB55_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr w9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp w9, w8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB55_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB55_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, w12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB55_2
+; CHECK-NOLSE-O0-NEXT:  LBB55_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB55_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #12] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs w8, w9, w8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB55_1
+; CHECK-NOLSE-O0-NEXT:    b LBB55_5
+; CHECK-NOLSE-O0-NEXT:  LBB55_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_umin_i32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB55_1
+; CHECK-OUTLINE-O0-NEXT:  LBB55_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w0, w8
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, ls
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas4_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w0, w8
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB55_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB55_2
+; CHECK-OUTLINE-O0-NEXT:  LBB55_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_umin_i32:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    lduminal w1, w0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_umin_i32:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    lduminal w1, w0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw umin ptr %ptr, i32 %rhs seq_cst
+  ret i32 %res
+}
+
+define i32 @atomicrmw_umax_i32(ptr %ptr, i32 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_umax_i32:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB56_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr w8, [x0]
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w1
+; CHECK-NOLSE-O1-NEXT:    csel w9, w8, w1, hi
+; CHECK-NOLSE-O1-NEXT:    stxr w10, w9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB56_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov w0, w8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_umax_i32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB56_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldxr w8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w1
+; CHECK-OUTLINE-O1-NEXT:    csel w9, w8, w1, hi
+; CHECK-OUTLINE-O1-NEXT:    stxr w10, w9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB56_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_umax_i32:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str w8, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB56_1
+; CHECK-NOLSE-O0-NEXT:  LBB56_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB56_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr w8, [sp, #28] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr w9, [sp, #24] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    subs w10, w8, w9
+; CHECK-NOLSE-O0-NEXT:    csel w12, w8, w9, hi
+; CHECK-NOLSE-O0-NEXT:  LBB56_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB56_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr w9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp w9, w8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB56_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB56_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, w12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB56_2
+; CHECK-NOLSE-O0-NEXT:  LBB56_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB56_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #12] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs w8, w9, w8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str w9, [sp, #28] ; 4-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB56_1
+; CHECK-NOLSE-O0-NEXT:    b LBB56_5
+; CHECK-NOLSE-O0-NEXT:  LBB56_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_umax_i32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str w1, [sp, #24] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB56_1
+; CHECK-OUTLINE-O0-NEXT:  LBB56_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #28] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #24] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #8] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs w9, w0, w8
+; CHECK-OUTLINE-O0-NEXT:    csel w1, w0, w8, hi
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas4_relax
+; CHECK-OUTLINE-O0-NEXT:    ldr w8, [sp, #8] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w0, w8
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #28] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB56_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB56_2
+; CHECK-OUTLINE-O0-NEXT:  LBB56_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #48
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_umax_i32:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldumax w1, w0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_umax_i32:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldumax w1, w0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw umax ptr %ptr, i32 %rhs monotonic
+  ret i32 %res
+}
+
+define i64 @atomicrmw_add_i64(ptr %ptr, i64 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_add_i64:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB57_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldaxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    add x9, x8, x1
+; CHECK-NOLSE-O1-NEXT:    stlxr w10, x9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB57_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov x0, x8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_add_i64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldadd8_acq_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_add_i64:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str x1, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str x8, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB57_1
+; CHECK-NOLSE-O0-NEXT:  LBB57_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB57_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [sp, #24] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #8] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x9, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add x12, x8, x9
+; CHECK-NOLSE-O0-NEXT:  LBB57_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB57_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr x9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp x9, x8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB57_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB57_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, x12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB57_2
+; CHECK-NOLSE-O0-NEXT:  LBB57_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB57_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs x8, x9, x8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB57_1
+; CHECK-NOLSE-O0-NEXT:    b LBB57_5
+; CHECK-NOLSE-O0-NEXT:  LBB57_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr x0, [sp] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_add_i64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldadd8_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_add_i64:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldaddal x1, x0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_add_i64:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldaddal x1, x0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw add ptr %ptr, i64 %rhs seq_cst
+  ret i64 %res
+}
+
+define i64 @atomicrmw_xchg_i64(ptr %ptr, i64 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_xchg_i64:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB58_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    stxr w9, x1, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB58_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov x0, x8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_xchg_i64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_swp8_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_xchg_i64:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str x1, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str x8, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB58_1
+; CHECK-NOLSE-O0-NEXT:  LBB58_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB58_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [sp, #24] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #8] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x12, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:  LBB58_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB58_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr x9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp x9, x8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB58_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB58_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, x12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB58_2
+; CHECK-NOLSE-O0-NEXT:  LBB58_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB58_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs x8, x9, x8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB58_1
+; CHECK-NOLSE-O0-NEXT:    b LBB58_5
+; CHECK-NOLSE-O0-NEXT:  LBB58_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr x0, [sp] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_xchg_i64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_swp8_relax
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_xchg_i64:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    swp x1, x0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_xchg_i64:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    swp x1, x0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw xchg ptr %ptr, i64 %rhs monotonic
+  ret i64 %res
+}
+
+define i64 @atomicrmw_sub_i64(ptr %ptr, i64 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_sub_i64:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB59_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldaxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    sub x9, x8, x1
+; CHECK-NOLSE-O1-NEXT:    stxr w10, x9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB59_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov x0, x8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_sub_i64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    neg x0, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldadd8_acq
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_sub_i64:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str x1, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str x8, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB59_1
+; CHECK-NOLSE-O0-NEXT:  LBB59_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB59_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [sp, #24] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #8] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x9, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    subs x12, x8, x9
+; CHECK-NOLSE-O0-NEXT:  LBB59_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB59_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr x9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp x9, x8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB59_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB59_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, x12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB59_2
+; CHECK-NOLSE-O0-NEXT:  LBB59_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB59_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs x8, x9, x8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB59_1
+; CHECK-NOLSE-O0-NEXT:    b LBB59_5
+; CHECK-NOLSE-O0-NEXT:  LBB59_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr x0, [sp] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_sub_i64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x9, x1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    mov x8, xzr
+; CHECK-OUTLINE-O0-NEXT:    subs x0, x8, x9
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldadd8_acq
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_sub_i64:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    neg x8, x1
+; CHECK-LSE-O1-NEXT:    ldadda x8, x0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_sub_i64:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    neg x8, x1
+; CHECK-LSE-O0-NEXT:    ldadda x8, x0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw sub ptr %ptr, i64 %rhs acquire
+  ret i64 %res
+}
+
+define i64 @atomicrmw_and_i64(ptr %ptr, i64 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_and_i64:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB60_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    and x9, x8, x1
+; CHECK-NOLSE-O1-NEXT:    stlxr w10, x9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB60_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov x0, x8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_and_i64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x8, #-1 ; =0xffffffffffffffff
+; CHECK-OUTLINE-O1-NEXT:    eor x0, x8, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldclr8_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_and_i64:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str x1, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str x8, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB60_1
+; CHECK-NOLSE-O0-NEXT:  LBB60_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB60_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [sp, #24] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #8] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x9, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    and x12, x8, x9
+; CHECK-NOLSE-O0-NEXT:  LBB60_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB60_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr x9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp x9, x8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB60_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB60_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, x12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB60_2
+; CHECK-NOLSE-O0-NEXT:  LBB60_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB60_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs x8, x9, x8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB60_1
+; CHECK-NOLSE-O0-NEXT:    b LBB60_5
+; CHECK-NOLSE-O0-NEXT:  LBB60_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr x0, [sp] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_and_i64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x9, x1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    mov x8, #-1 ; =0xffffffffffffffff
+; CHECK-OUTLINE-O0-NEXT:    eor x0, x8, x9
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldclr8_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_and_i64:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    mvn x8, x1
+; CHECK-LSE-O1-NEXT:    ldclrl x8, x0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_and_i64:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    mvn x8, x1
+; CHECK-LSE-O0-NEXT:    ldclrl x8, x0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw and ptr %ptr, i64 %rhs release
+  ret i64 %res
+}
+
+define i64 @atomicrmw_or_i64(ptr %ptr, i64 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_or_i64:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB61_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldaxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    orr x9, x8, x1
+; CHECK-NOLSE-O1-NEXT:    stlxr w10, x9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB61_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov x0, x8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_or_i64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldset8_acq_rel
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_or_i64:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str x1, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str x8, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB61_1
+; CHECK-NOLSE-O0-NEXT:  LBB61_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB61_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [sp, #24] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #8] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x9, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    orr x12, x8, x9
+; CHECK-NOLSE-O0-NEXT:  LBB61_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB61_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr x9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp x9, x8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB61_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB61_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, x12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB61_2
+; CHECK-NOLSE-O0-NEXT:  LBB61_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB61_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs x8, x9, x8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB61_1
+; CHECK-NOLSE-O0-NEXT:    b LBB61_5
+; CHECK-NOLSE-O0-NEXT:  LBB61_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr x0, [sp] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_or_i64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldset8_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_or_i64:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldsetal x1, x0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_or_i64:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldsetal x1, x0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw or ptr %ptr, i64 %rhs seq_cst
+  ret i64 %res
+}
+
+define i64 @atomicrmw_xor_i64(ptr %ptr, i64 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_xor_i64:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB62_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    eor x9, x8, x1
+; CHECK-NOLSE-O1-NEXT:    stxr w10, x9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB62_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov x0, x8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_xor_i64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_ldeor8_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_xor_i64:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str x1, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str x8, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB62_1
+; CHECK-NOLSE-O0-NEXT:  LBB62_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB62_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [sp, #24] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #8] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x9, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    eor x12, x8, x9
+; CHECK-NOLSE-O0-NEXT:  LBB62_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB62_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr x9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp x9, x8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB62_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB62_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, x12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB62_2
+; CHECK-NOLSE-O0-NEXT:  LBB62_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB62_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs x8, x9, x8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB62_1
+; CHECK-NOLSE-O0-NEXT:    b LBB62_5
+; CHECK-NOLSE-O0-NEXT:  LBB62_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr x0, [sp] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_xor_i64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_ldeor8_relax
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_xor_i64:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldeor x1, x0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_xor_i64:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldeor x1, x0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw xor ptr %ptr, i64 %rhs monotonic
+  ret i64 %res
+}
+
+define i64 @atomicrmw_min_i64(ptr %ptr, i64 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_min_i64:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB63_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldaxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    cmp x8, x1
+; CHECK-NOLSE-O1-NEXT:    csel x9, x8, x1, le
+; CHECK-NOLSE-O1-NEXT:    stxr w10, x9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB63_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov x0, x8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_min_i64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB63_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldaxr x8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cmp x8, x1
+; CHECK-OUTLINE-O1-NEXT:    csel x9, x8, x1, le
+; CHECK-OUTLINE-O1-NEXT:    stxr w10, x9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB63_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_min_i64:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str x1, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str x8, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB63_1
+; CHECK-NOLSE-O0-NEXT:  LBB63_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB63_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [sp, #24] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #8] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x9, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    subs x10, x8, x9
+; CHECK-NOLSE-O0-NEXT:    csel x12, x8, x9, le
+; CHECK-NOLSE-O0-NEXT:  LBB63_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB63_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr x9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp x9, x8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB63_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB63_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, x12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB63_2
+; CHECK-NOLSE-O0-NEXT:  LBB63_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB63_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs x8, x9, x8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB63_1
+; CHECK-NOLSE-O0-NEXT:    b LBB63_5
+; CHECK-NOLSE-O0-NEXT:  LBB63_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr x0, [sp] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_min_i64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #64
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #24] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x1, [sp, #32] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #40] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB63_1
+; CHECK-OUTLINE-O0-NEXT:  LBB63_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [sp, #40] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #24] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x8, [sp, #32] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs x9, x0, x8
+; CHECK-OUTLINE-O0-NEXT:    csel x1, x0, x8, le
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas8_acq
+; CHECK-OUTLINE-O0-NEXT:    ldr x8, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs x8, x0, x8
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #40] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB63_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB63_2
+; CHECK-OUTLINE-O0-NEXT:  LBB63_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #64
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_min_i64:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldsmina x1, x0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_min_i64:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldsmina x1, x0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw min ptr %ptr, i64 %rhs acquire
+  ret i64 %res
+}
+
+define i64 @atomicrmw_max_i64(ptr %ptr, i64 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_max_i64:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB64_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    cmp x8, x1
+; CHECK-NOLSE-O1-NEXT:    csel x9, x8, x1, gt
+; CHECK-NOLSE-O1-NEXT:    stlxr w10, x9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB64_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov x0, x8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_max_i64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB64_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldxr x8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cmp x8, x1
+; CHECK-OUTLINE-O1-NEXT:    csel x9, x8, x1, gt
+; CHECK-OUTLINE-O1-NEXT:    stlxr w10, x9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB64_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_max_i64:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str x1, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str x8, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB64_1
+; CHECK-NOLSE-O0-NEXT:  LBB64_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB64_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [sp, #24] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #8] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x9, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    subs x10, x8, x9
+; CHECK-NOLSE-O0-NEXT:    csel x12, x8, x9, gt
+; CHECK-NOLSE-O0-NEXT:  LBB64_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB64_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr x9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp x9, x8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB64_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB64_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, x12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB64_2
+; CHECK-NOLSE-O0-NEXT:  LBB64_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB64_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs x8, x9, x8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB64_1
+; CHECK-NOLSE-O0-NEXT:    b LBB64_5
+; CHECK-NOLSE-O0-NEXT:  LBB64_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr x0, [sp] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_max_i64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #64
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #24] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x1, [sp, #32] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #40] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB64_1
+; CHECK-OUTLINE-O0-NEXT:  LBB64_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [sp, #40] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #24] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x8, [sp, #32] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs x9, x0, x8
+; CHECK-OUTLINE-O0-NEXT:    csel x1, x0, x8, gt
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas8_rel
+; CHECK-OUTLINE-O0-NEXT:    ldr x8, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs x8, x0, x8
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #40] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB64_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB64_2
+; CHECK-OUTLINE-O0-NEXT:  LBB64_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #64
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_max_i64:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldsmaxl x1, x0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_max_i64:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldsmaxl x1, x0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw max ptr %ptr, i64 %rhs release
+  ret i64 %res
+}
+
+define i64 @atomicrmw_umin_i64(ptr %ptr, i64 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_umin_i64:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB65_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldaxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    cmp x8, x1
+; CHECK-NOLSE-O1-NEXT:    csel x9, x8, x1, ls
+; CHECK-NOLSE-O1-NEXT:    stlxr w10, x9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB65_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov x0, x8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_umin_i64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB65_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldaxr x8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cmp x8, x1
+; CHECK-OUTLINE-O1-NEXT:    csel x9, x8, x1, ls
+; CHECK-OUTLINE-O1-NEXT:    stlxr w10, x9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB65_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_umin_i64:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str x1, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str x8, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB65_1
+; CHECK-NOLSE-O0-NEXT:  LBB65_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB65_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [sp, #24] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #8] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x9, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    subs x10, x8, x9
+; CHECK-NOLSE-O0-NEXT:    csel x12, x8, x9, ls
+; CHECK-NOLSE-O0-NEXT:  LBB65_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB65_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr x9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp x9, x8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB65_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB65_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, x12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB65_2
+; CHECK-NOLSE-O0-NEXT:  LBB65_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB65_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs x8, x9, x8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB65_1
+; CHECK-NOLSE-O0-NEXT:    b LBB65_5
+; CHECK-NOLSE-O0-NEXT:  LBB65_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr x0, [sp] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_umin_i64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #64
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #24] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x1, [sp, #32] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #40] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB65_1
+; CHECK-OUTLINE-O0-NEXT:  LBB65_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [sp, #40] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #24] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x8, [sp, #32] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs x9, x0, x8
+; CHECK-OUTLINE-O0-NEXT:    csel x1, x0, x8, ls
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas8_acq_rel
+; CHECK-OUTLINE-O0-NEXT:    ldr x8, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs x8, x0, x8
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #40] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB65_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB65_2
+; CHECK-OUTLINE-O0-NEXT:  LBB65_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #64
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_umin_i64:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    lduminal x1, x0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_umin_i64:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    lduminal x1, x0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw umin ptr %ptr, i64 %rhs seq_cst
+  ret i64 %res
+}
+
+define i64 @atomicrmw_umax_i64(ptr %ptr, i64 %rhs) {
+; CHECK-NOLSE-O1-LABEL: atomicrmw_umax_i64:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:  LBB66_1: ; %atomicrmw.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr x8, [x0]
+; CHECK-NOLSE-O1-NEXT:    cmp x8, x1
+; CHECK-NOLSE-O1-NEXT:    csel x9, x8, x1, hi
+; CHECK-NOLSE-O1-NEXT:    stxr w10, x9, [x0]
+; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB66_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NOLSE-O1-NEXT:    mov x0, x8
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: atomicrmw_umax_i64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:  LBB66_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O1-NEXT:    ldxr x8, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cmp x8, x1
+; CHECK-OUTLINE-O1-NEXT:    csel x9, x8, x1, hi
+; CHECK-OUTLINE-O1-NEXT:    stxr w10, x9, [x0]
+; CHECK-OUTLINE-O1-NEXT:    cbnz w10, LBB66_1
+; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x8
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: atomicrmw_umax_i64:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOLSE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    str x1, [sp, #16] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [x0]
+; CHECK-NOLSE-O0-NEXT:    str x8, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    b LBB66_1
+; CHECK-NOLSE-O0-NEXT:  LBB66_1: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB66_2 Depth 2
+; CHECK-NOLSE-O0-NEXT:    ldr x8, [sp, #24] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #8] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    ldr x9, [sp, #16] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    subs x10, x8, x9
+; CHECK-NOLSE-O0-NEXT:    csel x12, x8, x9, hi
+; CHECK-NOLSE-O0-NEXT:  LBB66_2: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB66_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NOLSE-O0-NEXT:    ldaxr x9, [x11]
+; CHECK-NOLSE-O0-NEXT:    cmp x9, x8
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB66_4
+; CHECK-NOLSE-O0-NEXT:  ; %bb.3: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB66_2 Depth=2
+; CHECK-NOLSE-O0-NEXT:    stlxr w10, x12, [x11]
+; CHECK-NOLSE-O0-NEXT:    cbnz w10, LBB66_2
+; CHECK-NOLSE-O0-NEXT:  LBB66_4: ; %atomicrmw.start
+; CHECK-NOLSE-O0-NEXT:    ; in Loop: Header=BB66_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    subs x8, x9, x8
+; CHECK-NOLSE-O0-NEXT:    cset w8, eq
+; CHECK-NOLSE-O0-NEXT:    str x9, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NOLSE-O0-NEXT:    tbz w8, #0, LBB66_1
+; CHECK-NOLSE-O0-NEXT:    b LBB66_5
+; CHECK-NOLSE-O0-NEXT:  LBB66_5: ; %atomicrmw.end
+; CHECK-NOLSE-O0-NEXT:    ldr x0, [sp] ; 8-byte Folded Reload
+; CHECK-NOLSE-O0-NEXT:    add sp, sp, #32
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: atomicrmw_umax_i64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #64
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #24] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    str x1, [sp, #32] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [x0]
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #40] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    b LBB66_1
+; CHECK-OUTLINE-O0-NEXT:  LBB66_1: ; %atomicrmw.start
+; CHECK-OUTLINE-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [sp, #40] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp, #24] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldr x8, [sp, #32] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs x9, x0, x8
+; CHECK-OUTLINE-O0-NEXT:    csel x1, x0, x8, hi
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas8_relax
+; CHECK-OUTLINE-O0-NEXT:    ldr x8, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    subs x8, x0, x8
+; CHECK-OUTLINE-O0-NEXT:    cset w8, eq
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #40] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    tbz w8, #0, LBB66_1
+; CHECK-OUTLINE-O0-NEXT:    b LBB66_2
+; CHECK-OUTLINE-O0-NEXT:  LBB66_2: ; %atomicrmw.end
+; CHECK-OUTLINE-O0-NEXT:    ldr x0, [sp, #16] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #64
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: atomicrmw_umax_i64:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    ldumax x1, x0, [x0]
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: atomicrmw_umax_i64:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    ldumax x1, x0, [x0]
+; CHECK-LSE-O0-NEXT:    ret
+  %res = atomicrmw umax ptr %ptr, i64 %rhs monotonic
+  ret i64 %res
+}
+
 define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
 ; CHECK-NOLSE-O1-LABEL: cmpxchg_i8:
 ; CHECK-NOLSE-O1:       ; %bb.0:
 ; CHECK-NOLSE-O1-NEXT:    mov x8, x0
 ; CHECK-NOLSE-O1-NEXT:    ; kill: def $w2 killed $w2 def $x2
-; CHECK-NOLSE-O1-NEXT:  LBB47_1: ; %cmpxchg.start
+; CHECK-NOLSE-O1-NEXT:  LBB67_1: ; %cmpxchg.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldxrb w0, [x8]
 ; CHECK-NOLSE-O1-NEXT:    and w9, w0, #0xff
 ; CHECK-NOLSE-O1-NEXT:    cmp w9, w1, uxtb
-; CHECK-NOLSE-O1-NEXT:    b.ne LBB47_4
+; CHECK-NOLSE-O1-NEXT:    b.ne LBB67_4
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %cmpxchg.trystore
-; CHECK-NOLSE-O1-NEXT:    ; in Loop: Header=BB47_1 Depth=1
+; CHECK-NOLSE-O1-NEXT:    ; in Loop: Header=BB67_1 Depth=1
 ; CHECK-NOLSE-O1-NEXT:    stxrb w9, w2, [x8]
-; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB47_1
+; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB67_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.3:
 ; CHECK-NOLSE-O1-NEXT:    mov w1, #1 ; =0x1
 ; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT:    ret
-; CHECK-NOLSE-O1-NEXT:  LBB47_4: ; %cmpxchg.nostore
+; CHECK-NOLSE-O1-NEXT:  LBB67_4: ; %cmpxchg.nostore
 ; CHECK-NOLSE-O1-NEXT:    mov w1, wzr
 ; CHECK-NOLSE-O1-NEXT:    clrex
 ; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
@@ -4003,14 +6046,14 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
 ; CHECK-NOLSE-O0-LABEL: cmpxchg_i8:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    mov x9, x0
-; CHECK-NOLSE-O0-NEXT:  LBB47_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:  LBB67_1: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O0-NEXT:    ldaxrb w0, [x9]
 ; CHECK-NOLSE-O0-NEXT:    cmp w0, w1, uxtb
-; CHECK-NOLSE-O0-NEXT:    b.ne LBB47_3
-; CHECK-NOLSE-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB47_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB67_3
+; CHECK-NOLSE-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB67_1 Depth=1
 ; CHECK-NOLSE-O0-NEXT:    stlxrb w8, w2, [x9]
-; CHECK-NOLSE-O0-NEXT:    cbnz w8, LBB47_1
-; CHECK-NOLSE-O0-NEXT:  LBB47_3:
+; CHECK-NOLSE-O0-NEXT:    cbnz w8, LBB67_1
+; CHECK-NOLSE-O0-NEXT:  LBB67_3:
 ; CHECK-NOLSE-O0-NEXT:    and w8, w0, #0xff
 ; CHECK-NOLSE-O0-NEXT:    subs w8, w8, w1, uxtb
 ; CHECK-NOLSE-O0-NEXT:    cset w1, eq
@@ -4065,21 +6108,21 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
 ; CHECK-NOLSE-O1:       ; %bb.0:
 ; CHECK-NOLSE-O1-NEXT:    mov x8, x0
 ; CHECK-NOLSE-O1-NEXT:    ; kill: def $w2 killed $w2 def $x2
-; CHECK-NOLSE-O1-NEXT:  LBB48_1: ; %cmpxchg.start
+; CHECK-NOLSE-O1-NEXT:  LBB68_1: ; %cmpxchg.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldxrh w0, [x8]
 ; CHECK-NOLSE-O1-NEXT:    and w9, w0, #0xffff
 ; CHECK-NOLSE-O1-NEXT:    cmp w9, w1, uxth
-; CHECK-NOLSE-O1-NEXT:    b.ne LBB48_4
+; CHECK-NOLSE-O1-NEXT:    b.ne LBB68_4
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %cmpxchg.trystore
-; CHECK-NOLSE-O1-NEXT:    ; in Loop: Header=BB48_1 Depth=1
+; CHECK-NOLSE-O1-NEXT:    ; in Loop: Header=BB68_1 Depth=1
 ; CHECK-NOLSE-O1-NEXT:    stxrh w9, w2, [x8]
-; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB48_1
+; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB68_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.3:
 ; CHECK-NOLSE-O1-NEXT:    mov w1, #1 ; =0x1
 ; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT:    ret
-; CHECK-NOLSE-O1-NEXT:  LBB48_4: ; %cmpxchg.nostore
+; CHECK-NOLSE-O1-NEXT:  LBB68_4: ; %cmpxchg.nostore
 ; CHECK-NOLSE-O1-NEXT:    mov w1, wzr
 ; CHECK-NOLSE-O1-NEXT:    clrex
 ; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
@@ -4110,14 +6153,14 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
 ; CHECK-NOLSE-O0-LABEL: cmpxchg_i16:
 ; CHECK-NOLSE-O0:       ; %bb.0:
 ; CHECK-NOLSE-O0-NEXT:    mov x9, x0
-; CHECK-NOLSE-O0-NEXT:  LBB48_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:  LBB68_1: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O0-NEXT:    ldaxrh w0, [x9]
 ; CHECK-NOLSE-O0-NEXT:    cmp w0, w1, uxth
-; CHECK-NOLSE-O0-NEXT:    b.ne LBB48_3
-; CHECK-NOLSE-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB48_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB68_3
+; CHECK-NOLSE-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB68_1 Depth=1
 ; CHECK-NOLSE-O0-NEXT:    stlxrh w8, w2, [x9]
-; CHECK-NOLSE-O0-NEXT:    cbnz w8, LBB48_1
-; CHECK-NOLSE-O0-NEXT:  LBB48_3:
+; CHECK-NOLSE-O0-NEXT:    cbnz w8, LBB68_1
+; CHECK-NOLSE-O0-NEXT:  LBB68_3:
 ; CHECK-NOLSE-O0-NEXT:    and w8, w0, #0xffff
 ; CHECK-NOLSE-O0-NEXT:    subs w8, w8, w1, uxth
 ; CHECK-NOLSE-O0-NEXT:    cset w1, eq
@@ -4167,6 +6210,204 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
   ret { i16, i1 } %res
 }
 
+define { i32, i1 } @cmpxchg_i32(ptr %ptr, i32 %desired, i32 %new) {
+; CHECK-NOLSE-O1-LABEL: cmpxchg_i32:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:    mov x8, x0
+; CHECK-NOLSE-O1-NEXT:  LBB69_1: ; %cmpxchg.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr w0, [x8]
+; CHECK-NOLSE-O1-NEXT:    cmp w0, w1
+; CHECK-NOLSE-O1-NEXT:    b.ne LBB69_4
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %cmpxchg.trystore
+; CHECK-NOLSE-O1-NEXT:    ; in Loop: Header=BB69_1 Depth=1
+; CHECK-NOLSE-O1-NEXT:    stxr w9, w2, [x8]
+; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB69_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.3:
+; CHECK-NOLSE-O1-NEXT:    mov w1, #1 ; =0x1
+; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
+; CHECK-NOLSE-O1-NEXT:    ret
+; CHECK-NOLSE-O1-NEXT:  LBB69_4: ; %cmpxchg.nostore
+; CHECK-NOLSE-O1-NEXT:    mov w1, wzr
+; CHECK-NOLSE-O1-NEXT:    clrex
+; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: cmpxchg_i32:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov w19, w1
+; CHECK-OUTLINE-O1-NEXT:    mov w1, w2
+; CHECK-OUTLINE-O1-NEXT:    mov w0, w19
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas4_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    cmp w0, w19
+; CHECK-OUTLINE-O1-NEXT:    cset w1, eq
+; CHECK-OUTLINE-O1-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: cmpxchg_i32:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    mov x9, x0
+; CHECK-NOLSE-O0-NEXT:  LBB69_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ldaxr w0, [x9]
+; CHECK-NOLSE-O0-NEXT:    cmp w0, w1
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB69_3
+; CHECK-NOLSE-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB69_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    stlxr w8, w2, [x9]
+; CHECK-NOLSE-O0-NEXT:    cbnz w8, LBB69_1
+; CHECK-NOLSE-O0-NEXT:  LBB69_3:
+; CHECK-NOLSE-O0-NEXT:    subs w8, w0, w1
+; CHECK-NOLSE-O0-NEXT:    cset w1, eq
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: cmpxchg_i32:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w0, w1
+; CHECK-OUTLINE-O0-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov w1, w2
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas4_relax
+; CHECK-OUTLINE-O0-NEXT:    ldr w1, [sp, #12] ; 4-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    subs w8, w0, w1
+; CHECK-OUTLINE-O0-NEXT:    cset w1, eq
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: cmpxchg_i32:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    mov x8, x1
+; CHECK-LSE-O1-NEXT:    cas w8, w2, [x0]
+; CHECK-LSE-O1-NEXT:    cmp w8, w1
+; CHECK-LSE-O1-NEXT:    cset w1, eq
+; CHECK-LSE-O1-NEXT:    mov x0, x8
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: cmpxchg_i32:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    mov x8, x0
+; CHECK-LSE-O0-NEXT:    mov x0, x1
+; CHECK-LSE-O0-NEXT:    cas w0, w2, [x8]
+; CHECK-LSE-O0-NEXT:    subs w8, w0, w1
+; CHECK-LSE-O0-NEXT:    cset w1, eq
+; CHECK-LSE-O0-NEXT:    ret
+  %res = cmpxchg ptr %ptr, i32 %desired, i32 %new monotonic monotonic
+  ret { i32, i1 } %res
+}
+
+define { i64, i1 } @cmpxchg_i64(ptr %ptr, i64 %desired, i64 %new) {
+; CHECK-NOLSE-O1-LABEL: cmpxchg_i64:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:    mov x8, x0
+; CHECK-NOLSE-O1-NEXT:  LBB70_1: ; %cmpxchg.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr x0, [x8]
+; CHECK-NOLSE-O1-NEXT:    cmp x0, x1
+; CHECK-NOLSE-O1-NEXT:    b.ne LBB70_4
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %cmpxchg.trystore
+; CHECK-NOLSE-O1-NEXT:    ; in Loop: Header=BB70_1 Depth=1
+; CHECK-NOLSE-O1-NEXT:    stxr w9, x2, [x8]
+; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB70_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.3:
+; CHECK-NOLSE-O1-NEXT:    mov w1, #1 ; =0x1
+; CHECK-NOLSE-O1-NEXT:    ret
+; CHECK-NOLSE-O1-NEXT:  LBB70_4: ; %cmpxchg.nostore
+; CHECK-NOLSE-O1-NEXT:    mov w1, wzr
+; CHECK-NOLSE-O1-NEXT:    clrex
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: cmpxchg_i64:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x19, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x19
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas8_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    cmp x0, x19
+; CHECK-OUTLINE-O1-NEXT:    cset w1, eq
+; CHECK-OUTLINE-O1-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: cmpxchg_i64:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    mov x9, x0
+; CHECK-NOLSE-O0-NEXT:  LBB70_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ldaxr x0, [x9]
+; CHECK-NOLSE-O0-NEXT:    cmp x0, x1
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB70_3
+; CHECK-NOLSE-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB70_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    stlxr w8, x2, [x9]
+; CHECK-NOLSE-O0-NEXT:    cbnz w8, LBB70_1
+; CHECK-NOLSE-O0-NEXT:  LBB70_3:
+; CHECK-NOLSE-O0-NEXT:    subs x8, x0, x1
+; CHECK-NOLSE-O0-NEXT:    cset w1, eq
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: cmpxchg_i64:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas8_relax
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    subs x8, x0, x1
+; CHECK-OUTLINE-O0-NEXT:    cset w1, eq
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: cmpxchg_i64:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    mov x8, x1
+; CHECK-LSE-O1-NEXT:    cas x8, x2, [x0]
+; CHECK-LSE-O1-NEXT:    cmp x8, x1
+; CHECK-LSE-O1-NEXT:    cset w1, eq
+; CHECK-LSE-O1-NEXT:    mov x0, x8
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: cmpxchg_i64:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    mov x8, x0
+; CHECK-LSE-O0-NEXT:    mov x0, x1
+; CHECK-LSE-O0-NEXT:    cas x0, x2, [x8]
+; CHECK-LSE-O0-NEXT:    subs x8, x0, x1
+; CHECK-LSE-O0-NEXT:    cset w1, eq
+; CHECK-LSE-O0-NEXT:    ret
+  %res = cmpxchg ptr %ptr, i64 %desired, i64 %new monotonic monotonic
+  ret { i64, i1 } %res
+}
+
 define internal double @bitcast_to_double(ptr %ptr) {
 ; CHECK-NOLSE-LABEL: bitcast_to_double:
 ; CHECK-NOLSE:       ; %bb.0:

>From 904204a25c2e29e6379428fa20db96abd590d6c6 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme at arm.com>
Date: Fri, 15 Dec 2023 13:56:28 +0000
Subject: [PATCH 6/8] Do not duplicate explanation for min/max outlining

---
 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 8a0ff402336e14..1c1db11eda5150 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -778,12 +778,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                      UseOutlineAtomics))
       .clampScalar(0, s32, s64);
 
-  // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
-  // Don't outline them unless
-  // (1) high level <atomic> support approved:
-  //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
-  // (2) low level libgcc and compiler-rt support implemented by:
-  //   min/max outline atomics helpers
+  // Do not outline these atomic operations, as per the comment in
+  // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
   getActionDefinitionsBuilder(
       {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
       .clampScalar(0, s32, s64)

>From 3ac8ba156be30a7af27b30799f445155bcaf6c49 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme at arm.com>
Date: Fri, 15 Dec 2023 17:48:28 +0000
Subject: [PATCH 7/8] Fix codestyle

---
 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index a779ae53e75d06..601b3c9f365292 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -766,7 +766,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                    predNot(UseOutlineAtomics)))
       .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics)))
       .customIf([UseOutlineAtomics](const LegalityQuery &Query) {
-        return Query.Types[0].getSizeInBits() == 128 && !UseOutlineAtomics(Query);
+        return Query.Types[0].getSizeInBits() == 128 &&
+               !UseOutlineAtomics(Query);
       })
       .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0),
                      UseOutlineAtomics))

>From 6fc3b3bb741abd1afe870c21165f0440f1fbb63b Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme at arm.com>
Date: Mon, 18 Dec 2023 11:13:10 +0000
Subject: [PATCH 8/8] Allow ptr atomics & remove stray comment.

---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  6 +-
 .../AArch64/GlobalISel/arm64-atomic.ll        | 98 +++++++++++++++++++
 2 files changed, 99 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 2151cc1e81d4ed..f893c70690d534 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -773,7 +773,7 @@ static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
   auto Ordering = MMO.getMergedOrdering();
   LLT MemType = MMO.getMemoryType();
   uint64_t MemSize = MemType.getSizeInBytes();
-  if (!MemType.isScalar())
+  if (MemType.isVector())
     return RTLIB::UNKNOWN_LIBCALL;
 
 #define LCALLS(A, B)                                                           \
@@ -819,10 +819,6 @@ createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
 
-  // Add all the args, except for the last which is an imm denoting 'tail'.
-  // const CallLowering::ArgInfo &Result,
-  // Operand 0 & 1 are return: 0 is old val, 1 is success, 2-4 are reg operands:
-  // 2 is ptr, 3 is expected, 4 is new
   Type *RetTy;
   SmallVector<Register> RetRegs;
   SmallVector<CallLowering::ArgInfo, 3> Args;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
index 986c36426fb539..739332414c1985 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -6408,6 +6408,104 @@ define { i64, i1 } @cmpxchg_i64(ptr %ptr, i64 %desired, i64 %new) {
   ret { i64, i1 } %res
 }
 
+define { ptr, i1 } @cmpxchg_ptr(ptr %ptr, ptr %desired, ptr %new) {
+; CHECK-NOLSE-O1-LABEL: cmpxchg_ptr:
+; CHECK-NOLSE-O1:       ; %bb.0:
+; CHECK-NOLSE-O1-NEXT:    mov x8, x0
+; CHECK-NOLSE-O1-NEXT:  LBB71_1: ; %cmpxchg.start
+; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O1-NEXT:    ldxr x0, [x8]
+; CHECK-NOLSE-O1-NEXT:    cmp x0, x1
+; CHECK-NOLSE-O1-NEXT:    b.ne LBB71_4
+; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %cmpxchg.trystore
+; CHECK-NOLSE-O1-NEXT:    ; in Loop: Header=BB71_1 Depth=1
+; CHECK-NOLSE-O1-NEXT:    stxr w9, x2, [x8]
+; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB71_1
+; CHECK-NOLSE-O1-NEXT:  ; %bb.3:
+; CHECK-NOLSE-O1-NEXT:    mov w1, #1 ; =0x1
+; CHECK-NOLSE-O1-NEXT:    ret
+; CHECK-NOLSE-O1-NEXT:  LBB71_4: ; %cmpxchg.nostore
+; CHECK-NOLSE-O1-NEXT:    mov w1, wzr
+; CHECK-NOLSE-O1-NEXT:    clrex
+; CHECK-NOLSE-O1-NEXT:    ret
+;
+; CHECK-OUTLINE-O1-LABEL: cmpxchg_ptr:
+; CHECK-OUTLINE-O1:       ; %bb.0:
+; CHECK-OUTLINE-O1-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O1-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w19, -24
+; CHECK-OUTLINE-O1-NEXT:    .cfi_offset w20, -32
+; CHECK-OUTLINE-O1-NEXT:    mov x3, x0
+; CHECK-OUTLINE-O1-NEXT:    mov x19, x1
+; CHECK-OUTLINE-O1-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O1-NEXT:    mov x0, x19
+; CHECK-OUTLINE-O1-NEXT:    mov x2, x3
+; CHECK-OUTLINE-O1-NEXT:    bl ___aarch64_cas8_relax
+; CHECK-OUTLINE-O1-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    cmp x0, x19
+; CHECK-OUTLINE-O1-NEXT:    cset w1, eq
+; CHECK-OUTLINE-O1-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; CHECK-OUTLINE-O1-NEXT:    ret
+;
+; CHECK-NOLSE-O0-LABEL: cmpxchg_ptr:
+; CHECK-NOLSE-O0:       ; %bb.0:
+; CHECK-NOLSE-O0-NEXT:    mov x9, x0
+; CHECK-NOLSE-O0-NEXT:  LBB71_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NOLSE-O0-NEXT:    ldaxr x0, [x9]
+; CHECK-NOLSE-O0-NEXT:    cmp x0, x1
+; CHECK-NOLSE-O0-NEXT:    b.ne LBB71_3
+; CHECK-NOLSE-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB71_1 Depth=1
+; CHECK-NOLSE-O0-NEXT:    stlxr w8, x2, [x9]
+; CHECK-NOLSE-O0-NEXT:    cbnz w8, LBB71_1
+; CHECK-NOLSE-O0-NEXT:  LBB71_3:
+; CHECK-NOLSE-O0-NEXT:    subs x8, x0, x1
+; CHECK-NOLSE-O0-NEXT:    cset w1, eq
+; CHECK-NOLSE-O0-NEXT:    ret
+;
+; CHECK-OUTLINE-O0-LABEL: cmpxchg_ptr:
+; CHECK-OUTLINE-O0:       ; %bb.0:
+; CHECK-OUTLINE-O0-NEXT:    sub sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w30, -8
+; CHECK-OUTLINE-O0-NEXT:    .cfi_offset w29, -16
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x0, x1
+; CHECK-OUTLINE-O0-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-OUTLINE-O0-NEXT:    mov x1, x2
+; CHECK-OUTLINE-O0-NEXT:    ldr x2, [sp] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    bl ___aarch64_cas8_relax
+; CHECK-OUTLINE-O0-NEXT:    ldr x1, [sp, #8] ; 8-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    subs x8, x0, x1
+; CHECK-OUTLINE-O0-NEXT:    cset w1, eq
+; CHECK-OUTLINE-O0-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OUTLINE-O0-NEXT:    add sp, sp, #32
+; CHECK-OUTLINE-O0-NEXT:    ret
+;
+; CHECK-LSE-O1-LABEL: cmpxchg_ptr:
+; CHECK-LSE-O1:       ; %bb.0:
+; CHECK-LSE-O1-NEXT:    mov x8, x1
+; CHECK-LSE-O1-NEXT:    cas x8, x2, [x0]
+; CHECK-LSE-O1-NEXT:    cmp x8, x1
+; CHECK-LSE-O1-NEXT:    cset w1, eq
+; CHECK-LSE-O1-NEXT:    mov x0, x8
+; CHECK-LSE-O1-NEXT:    ret
+;
+; CHECK-LSE-O0-LABEL: cmpxchg_ptr:
+; CHECK-LSE-O0:       ; %bb.0:
+; CHECK-LSE-O0-NEXT:    mov x8, x0
+; CHECK-LSE-O0-NEXT:    mov x0, x1
+; CHECK-LSE-O0-NEXT:    cas x0, x2, [x8]
+; CHECK-LSE-O0-NEXT:    subs x8, x0, x1
+; CHECK-LSE-O0-NEXT:    cset w1, eq
+; CHECK-LSE-O0-NEXT:    ret
+  %res = cmpxchg ptr %ptr, ptr %desired, ptr %new monotonic monotonic
+  ret { ptr, i1 } %res
+}
+
 define internal double @bitcast_to_double(ptr %ptr) {
 ; CHECK-NOLSE-LABEL: bitcast_to_double:
 ; CHECK-NOLSE:       ; %bb.0:
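
For reference, the isScalar() -> isVector() change in LegalizerHelper.cpp above is what lets the new cmpxchg_ptr test reach ___aarch64_cas8_relax: an LLT pointer is its own kind, neither scalar nor vector, so the old !MemType.isScalar() guard rejected it while isVector() only filters out vector memory types. A minimal standalone sketch of that distinction, not part of the patch (the include path is an assumption and differs between LLVM releases):

#include "llvm/CodeGen/LowLevelType.h" // assumed path; newer trees use llvm/CodeGenTypes/LowLevelType.h

using namespace llvm;

int main() {
  LLT S64 = LLT::scalar(64);     // 64-bit scalar
  LLT P0 = LLT::pointer(0, 64);  // 64-bit pointer in address space 0

  // Old guard: !MemType.isScalar() also fires for pointers, so a
  // cmpxchg on a ptr never got the outline-atomics libcall.
  bool OldGuardRejectsPtr = !P0.isScalar(); // true

  // New guard: MemType.isVector() only fires for vectors, so pointer
  // operands (as in cmpxchg_ptr above) stay eligible for ___aarch64_cas8_*.
  bool NewGuardRejectsPtr = P0.isVector(); // false

  return (OldGuardRejectsPtr && !NewGuardRejectsPtr && S64.isScalar()) ? 0 : 1;
}
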


