[llvm] f1f5833 - [AArch64] Codegen for FEAT_LSE128

Tomas Matheson via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 25 04:02:44 PST 2023


Author: Tomas Matheson
Date: 2023-01-25T12:02:00Z
New Revision: f1f583347d00aad378eb0128e72d3d2e8be5174b

URL: https://github.com/llvm/llvm-project/commit/f1f583347d00aad378eb0128e72d3d2e8be5174b
DIFF: https://github.com/llvm/llvm-project/commit/f1f583347d00aad378eb0128e72d3d2e8be5174b.diff

LOG: [AArch64] Codegen for FEAT_LSE128

Codegen support for 128-bit atomicrmw (and|or|xchg); an illustrative IR
sketch follows the mapping below.
      - store atomic -> swpp
      - atomicrmw xchg -> swpp
      - atomicrmw and -> ldclrp
      - atomicrmw or -> ldsetp
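
For illustration, a minimal IR sketch (not part of this commit; the
function names are invented) of the forms that now select LSE128
instructions, with the expected instructions from the updated tests
noted in comments:

    define i128 @rmw_xchg(ptr %p, i128 %v) {
      ; expected: swpp (swppa/swppl/swppal for stronger orderings)
      %r = atomicrmw xchg ptr %p, i128 %v monotonic, align 16
      ret i128 %r
    }
    define i128 @rmw_and(ptr %p, i128 %v) {
      ; expected: mvn of each half, then ldclrp
      %r = atomicrmw and ptr %p, i128 %v monotonic, align 16
      ret i128 %r
    }
    define i128 @rmw_or(ptr %p, i128 %v) {
      ; expected: ldsetp
      %r = atomicrmw or ptr %p, i128 %v monotonic, align 16
      ret i128 %r
    }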

Reviewed By: lenary

Differential Revision: https://reviews.llvm.org/D141406

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-lse128.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse128.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse128.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse128.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6f2058c721577..c1c34c62ce2aa 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -821,6 +821,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 #undef LCALLNAME5
   }
 
+  if (Subtarget->hasLSE128()) {
+    // Custom lowering because i128 is not legal. Must be replaced by 2x64
+    // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
+    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
+    setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
+  }
+
   // 128-bit loads and stores can be done without expanding
   setOperationAction(ISD::LOAD, MVT::i128, Custom);
   setOperationAction(ISD::STORE, MVT::i128, Custom);
@@ -13301,12 +13309,14 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                                                     SelectionDAG &DAG) const {
   auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+  // No point replacing if we don't have the relevant instruction/libcall anyway
   if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
     return SDValue();
 
   // LSE has an atomic load-clear instruction, but not a load-and.
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
+  assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
   SDValue RHS = Op.getOperand(2);
   AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
   RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
@@ -22082,6 +22092,137 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
   Results.push_back(SDValue(CmpSwap, 3));
 }
 
+static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
+                                       AtomicOrdering Ordering) {
+  // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
+  // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
+  // the type is not legal. Therefore we shouldn't expect to see a 128-bit
+  // ATOMIC_LOAD_CLR at any point.
+  assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
+         "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
+  assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
+  assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
+
+  if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
+    // The operand will need to be XORed in a separate step.
+    switch (Ordering) {
+    case AtomicOrdering::Monotonic:
+      return AArch64::LDCLRP;
+      break;
+    case AtomicOrdering::Acquire:
+      return AArch64::LDCLRPA;
+      break;
+    case AtomicOrdering::Release:
+      return AArch64::LDCLRPL;
+      break;
+    case AtomicOrdering::AcquireRelease:
+    case AtomicOrdering::SequentiallyConsistent:
+      return AArch64::LDCLRPAL;
+      break;
+    default:
+      llvm_unreachable("Unexpected ordering!");
+    }
+  }
+
+  if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
+    switch (Ordering) {
+    case AtomicOrdering::Monotonic:
+      return AArch64::LDSETP;
+      break;
+    case AtomicOrdering::Acquire:
+      return AArch64::LDSETPA;
+      break;
+    case AtomicOrdering::Release:
+      return AArch64::LDSETPL;
+      break;
+    case AtomicOrdering::AcquireRelease:
+    case AtomicOrdering::SequentiallyConsistent:
+      return AArch64::LDSETPAL;
+      break;
+    default:
+      llvm_unreachable("Unexpected ordering!");
+    }
+  }
+
+  if (ISDOpcode == ISD::ATOMIC_SWAP) {
+    switch (Ordering) {
+    case AtomicOrdering::Monotonic:
+      return AArch64::SWPP;
+      break;
+    case AtomicOrdering::Acquire:
+      return AArch64::SWPPA;
+      break;
+    case AtomicOrdering::Release:
+      return AArch64::SWPPL;
+      break;
+    case AtomicOrdering::AcquireRelease:
+    case AtomicOrdering::SequentiallyConsistent:
+      return AArch64::SWPPAL;
+      break;
+    default:
+      llvm_unreachable("Unexpected ordering!");
+    }
+  }
+
+  llvm_unreachable("Unexpected ISDOpcode!");
+}
+
+static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
+                                          SmallVectorImpl<SDValue> &Results,
+                                          SelectionDAG &DAG,
+                                          const AArch64Subtarget *Subtarget) {
+  // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower them
+  // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
+  // rather than the CASP instructions, because CASP has register classes for
+  // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
+  // to present them as single operands. LSE128 instructions use the GPR64
+  // register class (because the pair does not have to be sequential), like
+  // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
+
+  assert(N->getValueType(0) == MVT::i128 &&
+         "AtomicLoadXXX on types less than 128 should be legal");
+
+  if (!Subtarget->hasLSE128())
+    return;
+
+  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
+  const SDValue &Chain = N->getOperand(0);
+  const SDValue &Ptr = N->getOperand(1);
+  const SDValue &Val128 = N->getOperand(2);
+  std::pair<SDValue, SDValue> Val2x64 = splitInt128(Val128, DAG);
+
+  const unsigned ISDOpcode = N->getOpcode();
+  const unsigned MachineOpcode =
+      getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
+
+  if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
+    SDLoc dl(Val128);
+    Val2x64.first =
+        DAG.getNode(ISD::XOR, dl, MVT::i64,
+                    DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
+    Val2x64.second =
+        DAG.getNode(ISD::XOR, dl, MVT::i64,
+                    DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
+  }
+
+  SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
+  if (DAG.getDataLayout().isBigEndian())
+    std::swap(Ops[0], Ops[1]);
+
+  MachineSDNode *AtomicInst =
+      DAG.getMachineNode(MachineOpcode, SDLoc(N),
+                         DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
+
+  DAG.setNodeMemRefs(AtomicInst, {MemOp});
+
+  SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
+  if (DAG.getDataLayout().isBigEndian())
+    std::swap(Lo, Hi);
+
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
+  Results.push_back(SDValue(AtomicInst, 2)); // Chain out
+}
+
 void AArch64TargetLowering::ReplaceNodeResults(
     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   switch (N->getOpcode()) {
@@ -22135,6 +22276,20 @@ void AArch64TargetLowering::ReplaceNodeResults(
   case ISD::ATOMIC_CMP_SWAP:
     ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
     return;
+  case ISD::ATOMIC_LOAD_CLR:
+    assert(N->getValueType(0) != MVT::i128 &&
+           "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
+    break;
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_SWAP: {
+    AtomicSDNode *AN = cast<AtomicSDNode>(N);
+    assert(AN->getVal().getValueType() == MVT::i128 &&
+           "Expected 128-bit atomicrmw.");
+    // These need custom type legalisation, so we go directly to the instruction.
+    ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
+    return;
+  }
   case ISD::ATOMIC_LOAD:
   case ISD::LOAD: {
     MemSDNode *LoadNode = cast<MemSDNode>(N);
@@ -22368,6 +22523,13 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
   if (Size > 128) return AtomicExpansionKind::None;
 
+  bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
+                      (AI->getOperation() == AtomicRMWInst::Xchg ||
+                       AI->getOperation() == AtomicRMWInst::Or ||
+                       AI->getOperation() == AtomicRMWInst::And);
+  if (CanUseLSE128)
+    return AtomicExpansionKind::None;
+
   // Nand is not supported in LSE.
   // Leave 128 bits to LLSC or CmpXChg.
   if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {

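A note on the and -> ldclrp mapping above: LDCLRP atomically computes
old & ~operand, so ATOMIC_LOAD_AND is emitted by first inverting each
64-bit half of the operand (the XOR with -1 in
ReplaceATOMIC_LOAD_128Results), giving old & ~(~value) == old & value.
The little-endian -O1 check lines in the tests below show exactly this
shape (comments here are editorial):

    mvn x1, x3           ; ~(high half of %value)
    mvn x8, x2           ; ~(low half of %value)
    ldclrp x8, x1, [x0]  ; memory = old & %value; x8, x1 = old

Correspondingly, shouldExpandAtomicRMWInIR now returns
AtomicExpansionKind::None for 128-bit xchg/or/and when LSE128 is
available, so these operations reach instruction selection instead of
being expanded via cmpxchg as before.
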
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-lse128.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-lse128.ll
index e8437d86ae336..9ba794b674868 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-lse128.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-lse128.ll
@@ -116,69 +116,29 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 }
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_unordered:
-; -O0:    casp x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
-;
-; -O1-LABEL: store_atomic_i128_aligned_unordered:
-; -O1:    ldp x4, x5, [x2]
-; -O1:    casp x6, x7, x0, x1, [x2]
-; -O1:    cmp x7, x5
-; -O1:    ccmp x6, x4, #0, eq
+; CHECK-LABEL: store_atomic_i128_aligned_unordered:
+; CHECK:    swpp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr unordered, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_monotonic:
-; -O0:    casp x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
-;
-; -O1-LABEL: store_atomic_i128_aligned_monotonic:
-; -O1:    ldp x4, x5, [x2]
-; -O1:    casp x6, x7, x0, x1, [x2]
-; -O1:    cmp x7, x5
-; -O1:    ccmp x6, x4, #0, eq
+; CHECK-LABEL: store_atomic_i128_aligned_monotonic:
+; CHECK:    swpp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr monotonic, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0:    caspl x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
-;
-; -O1-LABEL: store_atomic_i128_aligned_release:
-; -O1:    ldp x4, x5, [x2]
-; -O1:    caspl x6, x7, x0, x1, [x2]
-; -O1:    cmp x7, x5
-; -O1:    ccmp x6, x4, #0, eq
+; CHECK-LABEL: store_atomic_i128_aligned_release:
+; CHECK:    swppl x0, x1, [x2]
     store atomic i128 %value, ptr %ptr release, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
-;
-; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O1:    ldp x4, x5, [x2]
-; -O1:    caspal x6, x7, x0, x1, [x2]
-; -O1:    cmp x7, x5
-; -O1:    ccmp x6, x4, #0, eq
+; CHECK-LABEL: store_atomic_i128_aligned_seq_cst:
+; CHECK:    swppal x0, x1, [x2]
     store atomic i128 %value, ptr %ptr seq_cst, align 16
     ret void
 }
@@ -322,3 +282,6 @@ define dso_local void @store_atomic_i128_unaligned_seq_cst(i128 %value, ptr %ptr
     store atomic i128 %value, ptr %ptr seq_cst, align 1
     ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; -O0: {{.*}}
+; -O1: {{.*}}

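Note that 128-bit "store atomic" also selects SWPP, as the CHECK lines
above show: the LSE128 instructions added here are swap/clear/set only,
so the store is performed as a swap whose loaded result is discarded.
A minimal sketch (assumed, mirroring the tests above):

    ; store atomic i128 %value, ptr %ptr release, align 16
    ;   -> swppl x0, x1, [x2]   ; the old value left in x0, x1 is unused
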
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse128.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse128.ll
index 27aedd35c5d02..343ea36aeb73b 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse128.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse128.ll
@@ -145,85 +145,50 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
-; -O0:    casp x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    swpp x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    casp x4, x5, x2, x3, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    swpp x2, x1, [x0]
     %r = atomicrmw xchg ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire:
-; -O0:    caspa x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    swppa x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    caspa x4, x5, x2, x3, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    swppa x2, x1, [x0]
     %r = atomicrmw xchg ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release:
-; -O0:    caspl x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    swppl x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    caspl x4, x5, x2, x3, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    swppl x2, x1, [x0]
     %r = atomicrmw xchg ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    swppal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    caspal x4, x5, x2, x3, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    swppal x2, x1, [x0]
     %r = atomicrmw xchg ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    swppal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    caspal x4, x5, x2, x3, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    swppal x2, x1, [x0]
     %r = atomicrmw xchg ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -1575,105 +1540,70 @@ define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic:
-; -O0:    and x2, x9, x11
-; -O0:    and x9, x9, x10
-; -O0:    casp x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    mvn x1, x3
+; -O0:    mvn x0, x2
+; -O0:    ldclrp x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    and x8, x4, x2
-; -O1:    and x9, x7, x3
-; -O1:    casp x4, x5, x8, x9, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    mvn x1, x3
+; -O1:    mvn x8, x2
+; -O1:    ldclrp x8, x1, [x0]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_acquire:
-; -O0:    and x2, x9, x11
-; -O0:    and x9, x9, x10
-; -O0:    caspa x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    mvn x1, x3
+; -O0:    mvn x0, x2
+; -O0:    ldclrpa x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    and x8, x4, x2
-; -O1:    and x9, x7, x3
-; -O1:    caspa x4, x5, x8, x9, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    mvn x1, x3
+; -O1:    mvn x8, x2
+; -O1:    ldclrpa x8, x1, [x0]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_release:
-; -O0:    and x2, x9, x11
-; -O0:    and x9, x9, x10
-; -O0:    caspl x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    mvn x1, x3
+; -O0:    mvn x0, x2
+; -O0:    ldclrpl x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    and x8, x4, x2
-; -O1:    and x9, x7, x3
-; -O1:    caspl x4, x5, x8, x9, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    mvn x1, x3
+; -O1:    mvn x8, x2
+; -O1:    ldclrpl x8, x1, [x0]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel:
-; -O0:    and x2, x9, x11
-; -O0:    and x9, x9, x10
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    mvn x1, x3
+; -O0:    mvn x0, x2
+; -O0:    ldclrpal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    and x8, x4, x2
-; -O1:    and x9, x7, x3
-; -O1:    caspal x4, x5, x8, x9, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    mvn x1, x3
+; -O1:    mvn x8, x2
+; -O1:    ldclrpal x8, x1, [x0]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst:
-; -O0:    and x2, x9, x11
-; -O0:    and x9, x9, x10
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    mvn x1, x3
+; -O0:    mvn x0, x2
+; -O0:    ldclrpal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    and x8, x4, x2
-; -O1:    and x9, x7, x3
-; -O1:    caspal x4, x5, x8, x9, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    mvn x1, x3
+; -O1:    mvn x8, x2
+; -O1:    ldclrpal x8, x1, [x0]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -2955,105 +2885,50 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic:
-; -O0:    orr x2, x9, x11
-; -O0:    orr x9, x9, x10
-; -O0:    casp x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    ldsetp x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    orr x8, x4, x2
-; -O1:    orr x9, x7, x3
-; -O1:    casp x4, x5, x8, x9, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    ldsetp x2, x1, [x0]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_acquire:
-; -O0:    orr x2, x9, x11
-; -O0:    orr x9, x9, x10
-; -O0:    caspa x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    ldsetpa x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    orr x8, x4, x2
-; -O1:    orr x9, x7, x3
-; -O1:    caspa x4, x5, x8, x9, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    ldsetpa x2, x1, [x0]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_release:
-; -O0:    orr x2, x9, x11
-; -O0:    orr x9, x9, x10
-; -O0:    caspl x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    ldsetpl x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    orr x8, x4, x2
-; -O1:    orr x9, x7, x3
-; -O1:    caspl x4, x5, x8, x9, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    ldsetpl x2, x1, [x0]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel:
-; -O0:    orr x2, x9, x11
-; -O0:    orr x9, x9, x10
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    ldsetpal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    orr x8, x4, x2
-; -O1:    orr x9, x7, x3
-; -O1:    caspal x4, x5, x8, x9, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    ldsetpal x2, x1, [x0]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst:
-; -O0:    orr x2, x9, x11
-; -O0:    orr x9, x9, x10
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
+; -O0:    ldsetpal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    orr x8, x4, x2
-; -O1:    orr x9, x7, x3
-; -O1:    caspal x4, x5, x8, x9, [x0]
-; -O1:    cmp x5, x7
-; -O1:    ccmp x4, x6, #0, eq
+; -O1:    ldsetpal x2, x1, [x0]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }

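The big-endian runs below expect the same mnemonics; what changes is
which register holds which half of the i128. ReplaceATOMIC_LOAD_128Results
swaps the operand pair before building the machine node and swaps Lo/Hi
back before the BUILD_PAIR, presumably because on big-endian the
most-significant half lives at the lower address and so must occupy the
first register of the pair (an inference here, not stated in the commit):

    ; little-endian: first data operand = low half of %value
    ; big-endian:    first data operand = high half, hence the two
    ;                std::swap calls in the lowering above
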
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse128.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse128.ll
index 913abeb20f9bc..587f04baf11dc 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse128.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse128.ll
@@ -116,61 +116,29 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 }
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_unordered:
-; -O0:    casp x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
-;
-; -O1-LABEL: store_atomic_i128_aligned_unordered:
-; -O1:    ldp x4, x5, [x2]
-; -O1:    casp x6, x7, x0, x1, [x2]
-; -O1:    cmp x6, x4
-; -O1:    ccmp x7, x5, #0, eq
+; CHECK-LABEL: store_atomic_i128_aligned_unordered:
+; CHECK:    swpp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr unordered, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_monotonic:
-; -O0:    casp x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
-;
-; -O1-LABEL: store_atomic_i128_aligned_monotonic:
-; -O1:    ldp x4, x5, [x2]
-; -O1:    casp x6, x7, x0, x1, [x2]
-; -O1:    cmp x6, x4
-; -O1:    ccmp x7, x5, #0, eq
+; CHECK-LABEL: store_atomic_i128_aligned_monotonic:
+; CHECK:    swpp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr monotonic, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0:    caspl x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
-;
-; -O1-LABEL: store_atomic_i128_aligned_release:
-; -O1:    ldp x4, x5, [x2]
-; -O1:    caspl x6, x7, x0, x1, [x2]
-; -O1:    cmp x6, x4
-; -O1:    ccmp x7, x5, #0, eq
+; CHECK-LABEL: store_atomic_i128_aligned_release:
+; CHECK:    swppl x0, x1, [x2]
     store atomic i128 %value, ptr %ptr release, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
-;
-; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O1:    ldp x4, x5, [x2]
-; -O1:    caspal x6, x7, x0, x1, [x2]
-; -O1:    cmp x6, x4
-; -O1:    ccmp x7, x5, #0, eq
+; CHECK-LABEL: store_atomic_i128_aligned_seq_cst:
+; CHECK:    swppal x0, x1, [x2]
     store atomic i128 %value, ptr %ptr seq_cst, align 16
     ret void
 }
@@ -314,3 +282,6 @@ define dso_local void @store_atomic_i128_unaligned_seq_cst(i128 %value, ptr %ptr
     store atomic i128 %value, ptr %ptr seq_cst, align 1
     ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; -O0: {{.*}}
+; -O1: {{.*}}

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse128.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse128.ll
index 9d4222ec6ade6..18c43124f3236 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse128.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse128.ll
@@ -145,75 +145,50 @@ define dso_local i64 @atomicrmw_xchg_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
-; -O0:    casp x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    swpp x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    casp x4, x5, x2, x3, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    swpp x2, x1, [x0]
     %r = atomicrmw xchg ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire:
-; -O0:    caspa x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    swppa x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    caspa x4, x5, x2, x3, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    swppa x2, x1, [x0]
     %r = atomicrmw xchg ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release:
-; -O0:    caspl x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    swppl x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    caspl x4, x5, x2, x3, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    swppl x2, x1, [x0]
     %r = atomicrmw xchg ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    swppal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    caspal x4, x5, x2, x3, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    swppal x2, x1, [x0]
     %r = atomicrmw xchg ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    swppal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    caspal x4, x5, x2, x3, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    swppal x2, x1, [x0]
     %r = atomicrmw xchg ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -1665,95 +1640,70 @@ define dso_local i64 @atomicrmw_and_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic:
-; -O0:    and x2, x11, x12
-; -O0:    and x9, x10, x9
-; -O0:    casp x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    mvn x1, x3
+; -O0:    mvn x0, x2
+; -O0:    ldclrp x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    and x8, x4, x2
-; -O1:    and x9, x7, x3
-; -O1:    casp x4, x5, x8, x9, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    mvn x1, x3
+; -O1:    mvn x8, x2
+; -O1:    ldclrp x8, x1, [x0]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_acquire:
-; -O0:    and x2, x11, x12
-; -O0:    and x9, x10, x9
-; -O0:    caspa x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    mvn x1, x3
+; -O0:    mvn x0, x2
+; -O0:    ldclrpa x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    and x8, x4, x2
-; -O1:    and x9, x7, x3
-; -O1:    caspa x4, x5, x8, x9, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    mvn x1, x3
+; -O1:    mvn x8, x2
+; -O1:    ldclrpa x8, x1, [x0]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_release:
-; -O0:    and x2, x11, x12
-; -O0:    and x9, x10, x9
-; -O0:    caspl x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    mvn x1, x3
+; -O0:    mvn x0, x2
+; -O0:    ldclrpl x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    and x8, x4, x2
-; -O1:    and x9, x7, x3
-; -O1:    caspl x4, x5, x8, x9, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    mvn x1, x3
+; -O1:    mvn x8, x2
+; -O1:    ldclrpl x8, x1, [x0]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel:
-; -O0:    and x2, x11, x12
-; -O0:    and x9, x10, x9
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    mvn x1, x3
+; -O0:    mvn x0, x2
+; -O0:    ldclrpal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    and x8, x4, x2
-; -O1:    and x9, x7, x3
-; -O1:    caspal x4, x5, x8, x9, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    mvn x1, x3
+; -O1:    mvn x8, x2
+; -O1:    ldclrpal x8, x1, [x0]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst:
-; -O0:    and x2, x11, x12
-; -O0:    and x9, x10, x9
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    mvn x1, x3
+; -O0:    mvn x0, x2
+; -O0:    ldclrpal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    and x8, x4, x2
-; -O1:    and x9, x7, x3
-; -O1:    caspal x4, x5, x8, x9, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    mvn x1, x3
+; -O1:    mvn x8, x2
+; -O1:    ldclrpal x8, x1, [x0]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -3040,95 +2990,50 @@ define dso_local i64 @atomicrmw_or_i64_aligned_seq_cst(ptr %ptr, i64 %value) {
 
 define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic:
-; -O0:    orr x2, x11, x12
-; -O0:    orr x9, x10, x9
-; -O0:    casp x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    ldsetp x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    orr x8, x4, x2
-; -O1:    orr x9, x7, x3
-; -O1:    casp x4, x5, x8, x9, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    ldsetp x2, x1, [x0]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_acquire:
-; -O0:    orr x2, x11, x12
-; -O0:    orr x9, x10, x9
-; -O0:    caspa x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    ldsetpa x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    orr x8, x4, x2
-; -O1:    orr x9, x7, x3
-; -O1:    caspa x4, x5, x8, x9, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    ldsetpa x2, x1, [x0]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_release:
-; -O0:    orr x2, x11, x12
-; -O0:    orr x9, x10, x9
-; -O0:    caspl x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    ldsetpl x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    orr x8, x4, x2
-; -O1:    orr x9, x7, x3
-; -O1:    caspl x4, x5, x8, x9, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    ldsetpl x2, x1, [x0]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel:
-; -O0:    orr x2, x11, x12
-; -O0:    orr x9, x10, x9
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    ldsetpal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    orr x8, x4, x2
-; -O1:    orr x9, x7, x3
-; -O1:    caspal x4, x5, x8, x9, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    ldsetpal x2, x1, [x0]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
 
 define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst:
-; -O0:    orr x2, x11, x12
-; -O0:    orr x9, x10, x9
-; -O0:    caspal x0, x1, x2, x3, [x8]
-; -O0:    subs x11, x9, x11
-; -O0:    ccmp x8, x10, #0, eq
+; -O0:    ldsetpal x0, x1, [x8]
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
-; -O1:    ldp x4, x5, [x0]
-; -O1:    orr x8, x4, x2
-; -O1:    orr x9, x7, x3
-; -O1:    caspal x4, x5, x8, x9, [x0]
-; -O1:    cmp x4, x6
-; -O1:    ccmp x5, x7, #0, eq
+; -O1:    ldsetpal x2, x1, [x0]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }


        

