[llvm] Add out-of-line-atomics support to GlobalISel (PR #74588)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 6 04:21:02 PST 2023


llvmbot wrote:



@llvm/pr-subscribers-backend-aarch64

Author: Thomas Preud'homme (RoboTux)

Changes:

This patch implements the GlobalISel counterpart to
4d7df43ffdb460dddb2877a886f75f45c3fee188, which added out-of-line-atomics
support to SelectionDAG.

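For context, the outline-atomics helpers selected by the new
getOutlineAtomicLibcall in this patch follow a fixed naming scheme,
`__aarch64_<op><size>_<ordering>` (e.g. `__aarch64_cas16_acq_rel` in the test
updates below). A minimal sketch of that scheme in plain C++; the
`outlineHelperName` function is illustrative only and not part of the patch:

```cpp
// Illustrative only: spells out the helper-symbol naming scheme that the
// table-driven lookup in getOutlineAtomicLibcall encodes via RTLIB entries.
#include <cstdint>
#include <string>

std::string outlineHelperName(const std::string &Op, uint64_t MemSize,
                              const std::string &Ordering) {
  // Op: cas, swp, ldadd, ldclr, ldset or ldeor. MemSize: 1, 2, 4 or 8 bytes
  // (16 is supported for cas only). Ordering: relax, acq, rel or acq_rel.
  return "__aarch64_" + Op + std::to_string(MemSize) + "_" + Ordering;
}
```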

---

Patch is 363.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74588.diff


9 Files Affected:

- (modified) llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (+183) 
- (modified) llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (+26-3) 
- (modified) llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll (+8-40) 
- (modified) llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll (+16-32) 
- (modified) llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll (+955-1425) 
- (modified) llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-outline_atomics.ll (+363-1320) 
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll (+239) 
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll (+1367) 
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir (-2) 


``````````diff
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 045fc78218dae..186937e597c5b 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -765,6 +765,166 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
   return LegalizerHelper::Legalized;
 }
 
+static RTLIB::Libcall
+getOutlineAtomicLibcall(unsigned Opc, AtomicOrdering Order, uint64_t MemSize) {
+  unsigned ModeN, ModelN;
+  switch (MemSize) {
+  case 1:
+    ModeN = 0;
+    break;
+  case 2:
+    ModeN = 1;
+    break;
+  case 4:
+    ModeN = 2;
+    break;
+  case 8:
+    ModeN = 3;
+    break;
+  case 16:
+    ModeN = 4;
+    break;
+  default:
+    return RTLIB::UNKNOWN_LIBCALL;
+  }
+
+  switch (Order) {
+  case AtomicOrdering::Monotonic:
+    ModelN = 0;
+    break;
+  case AtomicOrdering::Acquire:
+    ModelN = 1;
+    break;
+  case AtomicOrdering::Release:
+    ModelN = 2;
+    break;
+  case AtomicOrdering::AcquireRelease:
+  case AtomicOrdering::SequentiallyConsistent:
+    ModelN = 3;
+    break;
+  default:
+    return RTLIB::UNKNOWN_LIBCALL;
+  }
+
+#define LCALLS(A, B)                                                           \
+  { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
+#define LCALL5(A)                                                              \
+  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
+  switch (Opc) {
+  case TargetOpcode::G_ATOMIC_CMPXCHG:
+  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_XCHG: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_ADD: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_AND: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_OR: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
+    return LC[ModeN][ModelN];
+  }
+  case TargetOpcode::G_ATOMICRMW_XOR: {
+    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
+    return LC[ModeN][ModelN];
+  }
+  default:
+    return RTLIB::UNKNOWN_LIBCALL;
+  }
+#undef LCALLS
+#undef LCALL5
+}
+
+static LegalizerHelper::LegalizeResult
+createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
+  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
+  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+
+  // For G_ATOMIC_CMPXCHG_WITH_SUCCESS, operands 0 and 1 are the results
+  // (0 is the old value, 1 is the success flag) and operands 2-4 are
+  // register operands (2 is the pointer, 3 is the expected value, 4 is the
+  // new value); G_ATOMIC_CMPXCHG has no success result.
+  Type *RetTy;
+  SmallVector<Register> RetRegs;
+  SmallVector<CallLowering::ArgInfo, 3> Args;
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  case TargetOpcode::G_ATOMIC_CMPXCHG:
+  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
+    Register Success;
+    LLT SuccessLLT;
+    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
+        MI.getFirst4RegLLTs();
+    RetRegs.push_back(Ret);
+    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
+    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
+      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
+               NewLLT) = MI.getFirst5RegLLTs();
+      RetRegs.push_back(Success);
+      RetTy = StructType::get(
+          Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
+    }
+    Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
+    Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
+    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
+    break;
+  }
+  case TargetOpcode::G_ATOMICRMW_XCHG:
+  case TargetOpcode::G_ATOMICRMW_ADD:
+  case TargetOpcode::G_ATOMICRMW_AND:
+  case TargetOpcode::G_ATOMICRMW_OR:
+  case TargetOpcode::G_ATOMICRMW_XOR: {
+    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
+    RetRegs.push_back(Ret);
+    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
+    if (Opc == TargetOpcode::G_ATOMICRMW_AND) {
+      Register Tmp = MRI.createGenericVirtualRegister(ValLLT);
+      MIRBuilder.buildXor(Tmp, MIRBuilder.buildConstant(ValLLT, -1), Val);
+      Val = Tmp;
+    }
+    Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
+    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
+    break;
+  }
+  default:
+    llvm_unreachable("unsupported opcode");
+  }
+
+  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
+  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+  auto &AtomicMI = cast<GMemOperation>(MI);
+  auto Ordering = AtomicMI.getMMO().getMergedOrdering();
+  uint64_t MemSize = AtomicMI.getMemSize();
+  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(Opc, Ordering, MemSize);
+  const char *Name = TLI.getLibcallName(RTLibcall);
+
+  // Unsupported libcall on the target.
+  if (!Name) {
+    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
+                      << MIRBuilder.getTII().getName(Opc) << "\n");
+    return LegalizerHelper::UnableToLegalize;
+  }
+
+  CallLowering::CallLoweringInfo Info;
+  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
+  Info.Callee = MachineOperand::CreateES(Name);
+  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);
+
+  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
+  if (!CLI.lowerCall(MIRBuilder, Info))
+    return LegalizerHelper::UnableToLegalize;
+
+  return LegalizerHelper::Legalized;
+}
+
 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                        Type *FromType) {
   auto ToMVT = MVT::getVT(ToType);
@@ -1020,6 +1180,18 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
       return Status;
     break;
   }
+  case TargetOpcode::G_ATOMICRMW_XCHG:
+  case TargetOpcode::G_ATOMICRMW_ADD:
+  case TargetOpcode::G_ATOMICRMW_AND:
+  case TargetOpcode::G_ATOMICRMW_OR:
+  case TargetOpcode::G_ATOMICRMW_XOR:
+  case TargetOpcode::G_ATOMIC_CMPXCHG:
+  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
+    auto Status = createAtomicLibcall(MIRBuilder, MI);
+    if (Status != Legalized)
+      return Status;
+    break;
+  }
   case TargetOpcode::G_BZERO:
   case TargetOpcode::G_MEMCPY:
   case TargetOpcode::G_MEMMOVE:
@@ -3793,6 +3965,17 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     return lowerTRUNC(MI);
   GISEL_VECREDUCE_CASES_NONSEQ
     return lowerVectorReduction(MI);
+  case G_ATOMICRMW_SUB: {
+    auto Val = MI.getOperand(2).getReg();
+    LLT ValLLT = MRI.getType(Val);
+    Register Tmp = MRI.createGenericVirtualRegister(ValLLT);
+    MIRBuilder.buildSub(Tmp, MIRBuilder.buildConstant(ValLLT, 0), Val);
+    auto [Ret, Mem] = MI.getFirst2Regs();
+    auto &MMO = cast<GMemOperation>(MI).getMMO();
+    MIRBuilder.buildAtomicRMWAdd(Ret, Mem, Tmp, MMO);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   }
 }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 21a412e9360dc..7fce3e501db57 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -758,16 +758,39 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
           all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
 
   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
+      .libcallIf([&ST](const LegalityQuery &Query) {
+        return ST.outlineAtomics() && !ST.hasLSE();
+      })
       .customIf([](const LegalityQuery &Query) {
         return Query.Types[0].getSizeInBits() == 128;
       })
       .clampScalar(0, s32, s64)
       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
 
+  getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
+                               G_ATOMICRMW_AND, G_ATOMICRMW_OR,
+                               G_ATOMICRMW_XOR})
+      .libcallIf([&ST](const LegalityQuery &Query) {
+        return ST.outlineAtomics() && !ST.hasLSE();
+      })
+      .clampScalar(0, s32, s64)
+      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
+
+  getActionDefinitionsBuilder(G_ATOMICRMW_SUB)
+      .lowerIf([&ST](const LegalityQuery &Query) {
+        return ST.outlineAtomics() && !ST.hasLSE();
+      })
+      .clampScalar(0, s32, s64)
+      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
+
+  // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
+  // Don't outline them unless
+  // (1) high-level <atomic> support is approved:
+  //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
+  // (2) low-level libgcc and compiler-rt support is implemented by
+  //   min/max outline atomics helpers.
   getActionDefinitionsBuilder(
-      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
-       G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
-       G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
+      {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
       .clampScalar(0, s32, s64)
       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
 
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
index fb4bef33d9b4f..fccafb29addbc 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-outline_atomics.ll
@@ -229,11 +229,7 @@ define dso_local i64 @load_atomic_i64_aligned_seq_cst_const(ptr readonly %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_unordered:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_unordered:
 ; -O1:    ldxp x0, x1, [x8]
@@ -244,11 +240,7 @@ define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_unordered_const:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_unordered_const:
 ; -O1:    ldxp x0, x1, [x8]
@@ -259,11 +251,7 @@ define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %pt
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
@@ -274,11 +262,7 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_monotonic_const:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_relax
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_monotonic_const:
 ; -O1:    ldxp x0, x1, [x8]
@@ -289,11 +273,7 @@ define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %pt
 
 define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_acquire:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
@@ -304,11 +284,7 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_acquire_const:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_acquire_const:
 ; -O1:    ldaxp x0, x1, [x8]
@@ -319,11 +295,7 @@ define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr)
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x10, x10, [x9]
-; -O0:    stlxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq_rel
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
@@ -334,11 +306,7 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) {
 ; -O0-LABEL: load_atomic_i128_aligned_seq_cst_const:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x10, x10, [x9]
-; -O0:    stlxp w8, x0, x1, [x9]
+; -O0:    bl __aarch64_cas16_acq_rel
 ;
 ; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const:
 ; -O1:    ldaxp x0, x1, [x8]
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll
index 3d204b734d4a0..e594561010464 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-outline_atomics.ll
@@ -117,14 +117,10 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_unordered:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_unordered:
@@ -136,14 +132,10 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_monotonic:
@@ -155,14 +147,10 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_release:
@@ -174,14 +162,10 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr)
 
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
index c660c139e35d4..e9b096e8c6c44 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
@@ -145,14 +145,10 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_relax
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
@@ -164,14 +160,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %val
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire:
@@ -183,14 +175,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release:
@@ -202,14 +190,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
+; -O0:    bl __aarch64_cas16_acq_rel
+; -O0:    eor x8, x0, x8
+; -O0:    eor x9, x1, x9
+; -O0:    orr x8, x8, x9
 ; -O0:    subs x8, x8, #0
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
@@ -221,14 +205,10 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8...
[truncated]

``````````
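
Two of the rewrites above rely on algebraic identities rather than on extra
helpers: G_ATOMICRMW_AND is funnelled through the LDCLR (bit-clear) helper by
complementing the operand first, and G_ATOMICRMW_SUB is lowered to
G_ATOMICRMW_ADD of the negated value. A small self-contained check of both
identities, with std::atomic standing in for the outline helpers (illustrative
only, not part of the patch):

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>

// Model of the LDCLR helper's semantics: atomically *P &= ~W, returning the
// old value.
static uint32_t ldclr(std::atomic<uint32_t> &P, uint32_t W) {
  return P.fetch_and(~W);
}

int main() {
  // G_ATOMICRMW_AND via LDCLR: "and V" is "ldclr ~V", which is why
  // createAtomicLibcall XORs the value with -1 before the call.
  std::atomic<uint32_t> A{0xF0F0F0F0u}, B{0xF0F0F0F0u};
  uint32_t V = 0x0FF000FFu;
  A.fetch_and(V);
  ldclr(B, ~V);
  assert(A.load() == B.load());

  // G_ATOMICRMW_SUB as ADD of the negation: "sub V" is "add (0 - V)";
  // unsigned wraparound makes the two exact.
  std::atomic<uint32_t> C{100u}, D{100u};
  C.fetch_sub(7u);
  D.fetch_add(0u - 7u);
  assert(C.load() == D.load());
  return 0;
}
```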



https://github.com/llvm/llvm-project/pull/74588

