[llvm] 7c84f94 - [AArch64] Codegen for FEAT_LRCPC3

Tomas Matheson via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 25 04:28:31 PST 2023


Author: Tomas Matheson
Date: 2023-01-25T12:27:23Z
New Revision: 7c84f94eb9f9c39229ca26d21d3bcb0b35efdc71

URL: https://github.com/llvm/llvm-project/commit/7c84f94eb9f9c39229ca26d21d3bcb0b35efdc71
DIFF: https://github.com/llvm/llvm-project/commit/7c84f94eb9f9c39229ca26d21d3bcb0b35efdc71.diff

LOG: [AArch64] Codegen for FEAT_LRCPC3

Implements support for the following 128-bit atomic operations with +rcpc3:
 - 128-bit store-release -> STILP
 - 128-bit load-acquire -> LDIAPP

D126250 and D137590 added support for emitting LDAPR (Load-Acquire RCpc) rather
than LDAR (Load-Acquire) when +rcpc is available. This patch allows emitting
the 128-bit RCpc instructions added in FEAT_LRCPC3 (LDIAPP/STILP). The
implementation differs from the LDAPR one because these new instructions have
no non-RCpc equivalents.
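
For reference, a minimal IR reproducer distilled from the updated tests
below (not itself part of the patch); run through
llc -mtriple=aarch64 -mattr=+lse2,+rcpc3 -O1, the acquire load selects
LDIAPP and the release store selects STILP:

    define i128 @load_acquire(ptr %ptr) {
      ; expected: ldiapp x0, x1, [x0]
      %r = load atomic i128, ptr %ptr acquire, align 16
      ret i128 %r
    }

    define void @store_release(i128 %value, ptr %ptr) {
      ; expected: stilp x0, x1, [x2]
      store atomic i128 %value, ptr %ptr release, align 16
      ret void
    }

Weaker orderings (unordered/monotonic) continue to use LDP/STP, and seq_cst
still falls back to LDP/STP bracketed by DMB ISH fences, as the tests below
show.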

Support for the offset variants will be added in D141431.

Reviewed By: lenary

Differential Revision: https://reviews.llvm.org/D141429

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc3.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-rcpc3.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-rcpc3.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-fence.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-cmpxchg-rcpc3.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-fence.ll
    llvm/test/CodeGen/AArch64/Atomics/generate-tests.py

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c1c34c62ce2aa..46db20ecdd742 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -834,7 +834,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STORE, MVT::i128, Custom);
 
   // Aligned 128-bit loads and stores are single-copy atomic according to the
-  // v8.4a spec.
+  // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
   if (Subtarget->hasLSE2()) {
     setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
     setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
@@ -2572,8 +2572,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::SSTNT1_PRED)
     MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
     MAKE_CASE(AArch64ISD::LDP)
+    MAKE_CASE(AArch64ISD::LDIAPP)
     MAKE_CASE(AArch64ISD::LDNP)
     MAKE_CASE(AArch64ISD::STP)
+    MAKE_CASE(AArch64ISD::STILP)
     MAKE_CASE(AArch64ISD::STNP)
     MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
@@ -5698,9 +5700,14 @@ SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
   MemSDNode *StoreNode = cast<MemSDNode>(Op);
   assert(StoreNode->getMemoryVT() == MVT::i128);
   assert(StoreNode->isVolatile() || StoreNode->isAtomic());
-  assert(!StoreNode->isAtomic() ||
-         StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
-         StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
+
+  bool IsStoreRelease =
+      StoreNode->getMergedOrdering() == AtomicOrdering::Release;
+  if (StoreNode->isAtomic())
+    assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
+            Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
+           StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
+           StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
 
   SDValue Value = StoreNode->getOpcode() == ISD::STORE
                       ? StoreNode->getOperand(1)
@@ -5710,8 +5717,10 @@ SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
                            DAG.getConstant(0, DL, MVT::i64));
   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
                            DAG.getConstant(1, DL, MVT::i64));
+
+  unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
   SDValue Result = DAG.getMemIntrinsicNode(
-      AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
+      Opcode, DL, DAG.getVTList(MVT::Other),
       {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
       StoreNode->getMemoryVT(), StoreNode->getMemOperand());
   return Result;
@@ -5984,7 +5993,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerINTRINSIC_VOID(Op, DAG);
   case ISD::ATOMIC_STORE:
     if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
-      assert(Subtarget->hasLSE2());
+      assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
       return LowerStore128(Op, DAG);
     }
     return SDValue();
@@ -22325,9 +22334,16 @@ void AArch64TargetLowering::ReplaceNodeResults(
     }
 
     if (SDValue(N, 0).getValueType() == MVT::i128) {
+      auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
+      bool isLoadAcquire =
+          AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
+      unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
+
+      if (isLoadAcquire)
+        assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
+
       SDValue Result = DAG.getMemIntrinsicNode(
-          AArch64ISD::LDP, SDLoc(N),
-          DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
+          Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
           {LoadNode->getChain(), LoadNode->getBasePtr()},
           LoadNode->getMemoryVT(), LoadNode->getMemOperand());
 
@@ -22450,8 +22466,27 @@ bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
   return false;
 }
 
+bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
+  if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
+    return false;
+
+  if (auto LI = dyn_cast<LoadInst>(I))
+    return LI->getType()->getPrimitiveSizeInBits() == 128 &&
+           LI->getAlign() >= Align(16) &&
+           LI->getOrdering() == AtomicOrdering::Acquire;
+
+  if (auto SI = dyn_cast<StoreInst>(I))
+    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
+           SI->getAlign() >= Align(16) &&
+           SI->getOrdering() == AtomicOrdering::Release;
+
+  return false;
+}
+
 bool AArch64TargetLowering::shouldInsertFencesForAtomic(
     const Instruction *I) const {
+  if (isOpSuitableForRCPC3(I))
+    return false;
   return isOpSuitableForLDPSTP(I);
 }
 
@@ -22485,7 +22520,7 @@ bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
 TargetLoweringBase::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
-  if (Size != 128 || isOpSuitableForLDPSTP(SI))
+  if (Size != 128 || isOpSuitableForLDPSTP(SI) || isOpSuitableForRCPC3(SI))
     return AtomicExpansionKind::None;
   return AtomicExpansionKind::Expand;
 }
@@ -22497,7 +22532,7 @@ TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
 
-  if (Size != 128 || isOpSuitableForLDPSTP(LI))
+  if (Size != 128 || isOpSuitableForLDPSTP(LI) || isOpSuitableForRCPC3(LI))
     return AtomicExpansionKind::None;
 
   // At -O0, fast-regalloc cannot cope with the live vregs necessary to

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 0edec721ed879..92619f7e4c5a3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -475,8 +475,10 @@ enum NodeType : unsigned {
   STZ2G,
 
   LDP,
+  LDIAPP,
   LDNP,
   STP,
+  STILP,
   STNP,
 
   // Memory Operations
@@ -705,6 +707,7 @@ class AArch64TargetLowering : public TargetLowering {
   void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
 
   bool isOpSuitableForLDPSTP(const Instruction *I) const;
+  bool isOpSuitableForRCPC3(const Instruction *I) const;
   bool shouldInsertFencesForAtomic(const Instruction *I) const override;
   bool
   shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override;

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 17fc90afcaab6..5754546627152 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -362,8 +362,10 @@ def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
 def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
 
 def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64ldiapp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stilp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 
 // Generates the general dynamic sequences, i.e.
@@ -786,8 +788,10 @@ def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
 def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
 
 def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def AArch64ldiapp : SDNode<"AArch64ISD::LDIAPP", SDT_AArch64ldiapp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def AArch64ldnp : SDNode<"AArch64ISD::LDNP", SDT_AArch64ldnp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
@@ -3408,7 +3412,7 @@ def STPSpre : StorePairPreIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
 def STPDpre : StorePairPreIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
 def STPQpre : StorePairPreIdx<0b10, 1, FPR128Op, simm7s16, "stp">;
 
-// Pair (pre-indexed)
+// Pair (post-indexed)
 def STPWpost : StorePairPostIdx<0b00, 0, GPR32z, simm7s4, "stp">;
 def STPXpost : StorePairPostIdx<0b10, 0, GPR64z, simm7s8, "stp">;
 def STPSpost : StorePairPostIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
@@ -8684,6 +8688,9 @@ let Predicates = [HasRCPC3] in {
   def LDIAPPW:    BaseLRCPC3IntegerLoadStorePair<0b10, 0b01, 0b0001, (outs GPR32:$Rt, GPR32:$Rt2), (ins GPR64sp0:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn]", "">;
   def LDIAPPX:    BaseLRCPC3IntegerLoadStorePair<0b11, 0b01, 0b0001, (outs GPR64:$Rt, GPR64:$Rt2), (ins GPR64sp0:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn]", "">;
 
+  def : Pat<(AArch64ldiapp GPR64sp:$Rn), (LDIAPPX GPR64sp:$Rn)>;
+  def : Pat<(AArch64stilp GPR64:$Rt, GPR64:$Rt2, GPR64sp:$Rn), (STILPX GPR64:$Rt, GPR64:$Rt2, GPR64sp:$Rn)>;
+
   // Aliases for when offset=0
   def : InstAlias<"stilp\t$Rt, $Rt2, [$Rn, #0]", (STILPW GPR32: $Rt, GPR32: $Rt2, GPR64sp:$Rn)>;
   def : InstAlias<"stilp\t$Rt, $Rt2, [$Rn, #0]", (STILPX GPR64: $Rt, GPR64: $Rt2, GPR64sp:$Rn)>;

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 186d0ed35d56f..7e5fd1a5f6bdf 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -79,6 +79,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   const LLT &MinFPScalar = HasFP16 ? s16 : s32;
 
   const bool HasCSSC = ST.hasCSSC();
+  const bool HasRCPC3 = ST.hasRCPC3();
 
   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
       .legalFor({p0, s8, s16, s32, s64})
@@ -309,6 +310,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   };
 
   getActionDefinitionsBuilder(G_LOAD)
+      .customIf([=](const LegalityQuery &Query) {
+        return HasRCPC3 && Query.Types[0] == s128 &&
+               Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
+      })
       .customIf([=](const LegalityQuery &Query) {
         return Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
@@ -328,16 +333,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                                  {v2s64, p0, s128, 8}})
       // These extends are also legal
       .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
-      .widenScalarToNextPow2(0, /* MinSize = */8)
+      .widenScalarToNextPow2(0, /* MinSize = */ 8)
       .lowerIfMemSizeNotByteSizePow2()
       .clampScalar(0, s8, s64)
-      .narrowScalarIf([=](const LegalityQuery &Query) {
-        // Clamp extending load results to 32-bits.
-        return Query.Types[0].isScalar() &&
-          Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
-          Query.Types[0].getSizeInBits() > 32;
-        },
-        changeTo(0, s32))
+      .narrowScalarIf(
+          [=](const LegalityQuery &Query) {
+            // Clamp extending load results to 32-bits.
+            return Query.Types[0].isScalar() &&
+                   Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
+                   Query.Types[0].getSizeInBits() > 32;
+          },
+          changeTo(0, s32))
       .clampMaxNumElements(0, s8, 16)
       .clampMaxNumElements(0, s16, 8)
       .clampMaxNumElements(0, s32, 4)
@@ -347,31 +353,25 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .scalarizeIf(typeIs(0, v2s16), 0);
 
   getActionDefinitionsBuilder(G_STORE)
+      .customIf([=](const LegalityQuery &Query) {
+        return HasRCPC3 && Query.Types[0] == s128 &&
+               Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
+      })
       .customIf([=](const LegalityQuery &Query) {
         return Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
       })
-      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
-                                 {s16, p0, s8, 8}, // truncstorei8 from s16
-                                 {s32, p0, s8, 8}, // truncstorei8 from s32
-                                 {s64, p0, s8, 8}, // truncstorei8 from s64
-                                 {s16, p0, s16, 8},
-                                 {s32, p0, s16, 8}, // truncstorei16 from s32
-                                 {s64, p0, s16, 8}, // truncstorei16 from s64
-                                 {s32, p0, s8, 8},
-                                 {s32, p0, s16, 8},
-                                 {s32, p0, s32, 8},
-                                 {s64, p0, s64, 8},
-                                 {s64, p0, s32, 8}, // truncstorei32 from s64
-                                 {p0, p0, s64, 8},
-                                 {s128, p0, s128, 8},
-                                 {v16s8, p0, s128, 8},
-                                 {v8s8, p0, s64, 8},
-                                 {v4s16, p0, s64, 8},
-                                 {v8s16, p0, s128, 8},
-                                 {v2s32, p0, s64, 8},
-                                 {v4s32, p0, s128, 8},
-                                 {v2s64, p0, s128, 8}})
+      .legalForTypesWithMemDesc(
+          {{s8, p0, s8, 8},     {s16, p0, s8, 8},  // truncstorei8 from s16
+           {s32, p0, s8, 8},                       // truncstorei8 from s32
+           {s64, p0, s8, 8},                       // truncstorei8 from s64
+           {s16, p0, s16, 8},   {s32, p0, s16, 8}, // truncstorei16 from s32
+           {s64, p0, s16, 8},                      // truncstorei16 from s64
+           {s32, p0, s8, 8},    {s32, p0, s16, 8},    {s32, p0, s32, 8},
+           {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
+           {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
+           {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
+           {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
       .clampScalar(0, s8, s64)
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
@@ -1188,27 +1188,49 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
   const LLT ValTy = MRI.getType(ValReg);
 
   if (ValTy == LLT::scalar(128)) {
-    assert((*MI.memoperands_begin())->getSuccessOrdering() ==
-               AtomicOrdering::Monotonic ||
-           (*MI.memoperands_begin())->getSuccessOrdering() ==
-               AtomicOrdering::Unordered);
-    assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
+
+    AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
+    bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
+    bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
+    bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
+    bool IsRcpC3 =
+        ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);
+
     LLT s64 = LLT::scalar(64);
+
+    unsigned Opcode;
+    if (IsRcpC3) {
+      Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
+    } else {
+      // For LSE2, loads/stores should have been converted to monotonic and had
+      // a fence inserted after them.
+      assert(Ordering == AtomicOrdering::Monotonic ||
+             Ordering == AtomicOrdering::Unordered);
+      assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
+
+      Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
+    }
+
     MachineInstrBuilder NewI;
-    if (MI.getOpcode() == TargetOpcode::G_LOAD) {
-      NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
+    if (IsLoad) {
+      NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
       MIRBuilder.buildMergeLikeInstr(
           ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
     } else {
       auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
       NewI = MIRBuilder.buildInstr(
-          AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
+          Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
+    }
+
+    if (IsRcpC3) {
+      NewI.addUse(MI.getOperand(1).getReg());
+    } else {
+      Register Base;
+      int Offset;
+      matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
+      NewI.addUse(Base);
+      NewI.addImm(Offset / 8);
     }
-    Register Base;
-    int Offset;
-    matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
-    NewI.addUse(Base);
-    NewI.addImm(Offset / 8);
 
     NewI.cloneMemRefs(MI);
     constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
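
Note that AArch64 selects via GlobalISel by default at -O0, so the -O0 RUN
lines in the tests below exercise this legalizer change. A quick way to
inspect the legalized output directly (a hypothetical invocation, not one of
the committed RUN lines) is to stop after the legalizer and look for
LDIAPPX/STILPX in the MIR:

    ; llc -mtriple=aarch64 -mattr=+lse2,+rcpc3 -global-isel -stop-after=legalizer
    define void @store_release(i128 %value, ptr %ptr) {
      ; legalized directly to STILPX rather than STPXi plus a fence
      store atomic i128 %value, ptr %ptr release, align 16
      ret void
    }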

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc3.ll
index df73e52c0c1d9..435f0a0c01765 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc3.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"
 ; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse2,+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse2,+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 
 define dso_local i8 @load_atomic_i8_aligned_unordered(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i8_aligned_unordered:
@@ -228,121 +228,59 @@ define dso_local i64 @load_atomic_i64_aligned_seq_cst_const(ptr readonly %ptr) {
 }
 
 define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_unordered:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_unordered:
-; -O1:    ldxp x0, x1, [x8]
-; -O1:    stxp w9, x0, x1, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_unordered:
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr unordered, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_unordered_const:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_unordered_const:
-; -O1:    ldxp x0, x1, [x8]
-; -O1:    stxp w9, x0, x1, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_unordered_const:
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr unordered, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_monotonic:
-; -O1:    ldxp x0, x1, [x8]
-; -O1:    stxp w9, x0, x1, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_monotonic:
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_monotonic_const:
-; -O0:    ldxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_monotonic_const:
-; -O1:    ldxp x0, x1, [x8]
-; -O1:    stxp w9, x0, x1, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_monotonic_const:
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_acquire:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_acquire:
-; -O1:    ldaxp x0, x1, [x8]
-; -O1:    stxp w9, x0, x1, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_acquire:
+; CHECK:    ldiapp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_acquire_const:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x0, x1, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_acquire_const:
-; -O1:    ldaxp x0, x1, [x8]
-; -O1:    stxp w9, x0, x1, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_acquire_const:
+; CHECK:    ldiapp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x10, x10, [x9]
-; -O0:    stlxp w8, x0, x1, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_seq_cst:
-; -O1:    ldaxp x0, x1, [x8]
-; -O1:    stlxp w9, x0, x1, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_seq_cst:
+; CHECK:    ldp x0, x1, [x0]
+; CHECK:    dmb ish
     %r = load atomic i128, ptr %ptr seq_cst, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_seq_cst_const:
-; -O0:    ldaxp x0, x1, [x9]
-; -O0:    cmp x0, x10
-; -O0:    cmp x1, x10
-; -O0:    stlxp w8, x10, x10, [x9]
-; -O0:    stlxp w8, x0, x1, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const:
-; -O1:    ldaxp x0, x1, [x8]
-; -O1:    stlxp w9, x0, x1, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_seq_cst_const:
+; CHECK:    ldp x0, x1, [x0]
+; CHECK:    dmb ish
     %r = load atomic i128, ptr %ptr seq_cst, align 16
     ret i128 %r
 }
@@ -626,3 +564,6 @@ define dso_local i128 @load_atomic_i128_unaligned_seq_cst_const(ptr readonly %pt
     %r = load atomic i128, ptr %ptr seq_cst, align 1
     ret i128 %r
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; -O0: {{.*}}
+; -O1: {{.*}}

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-rcpc3.ll
index cf7d512548b14..8e1ae1d0a754c 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-store-rcpc3.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"
 ; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse2,+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse2,+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 
 define dso_local void @store_atomic_i8_aligned_unordered(i8 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i8_aligned_unordered:
@@ -116,77 +116,31 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 }
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_unordered:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
-;
-; -O1-LABEL: store_atomic_i128_aligned_unordered:
-; -O1:    ldxp xzr, x8, [x2]
-; -O1:    stxp w8, x0, x1, [x2]
+; CHECK-LABEL: store_atomic_i128_aligned_unordered:
+; CHECK:    stp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr unordered, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stxp w8, x14, x15, [x11]
-; -O0:    stxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
-;
-; -O1-LABEL: store_atomic_i128_aligned_monotonic:
-; -O1:    ldxp xzr, x8, [x2]
-; -O1:    stxp w8, x0, x1, [x2]
+; CHECK-LABEL: store_atomic_i128_aligned_monotonic:
+; CHECK:    stp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr monotonic, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0:    ldxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
-;
-; -O1-LABEL: store_atomic_i128_aligned_release:
-; -O1:    ldxp xzr, x8, [x2]
-; -O1:    stlxp w8, x0, x1, [x2]
+; CHECK-LABEL: store_atomic_i128_aligned_release:
+; CHECK:    stilp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr release, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x10, x9, [x11]
-; -O0:    cmp x10, x12
-; -O0:    cmp x9, x13
-; -O0:    stlxp w8, x14, x15, [x11]
-; -O0:    stlxp w8, x10, x9, [x11]
-; -O0:    eor x8, x10, x8
-; -O0:    eor x11, x9, x11
-; -O0:    orr x8, x8, x11
-; -O0:    subs x8, x8, #0
-;
-; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O1:    ldaxp xzr, x8, [x2]
-; -O1:    stlxp w8, x0, x1, [x2]
+; CHECK-LABEL: store_atomic_i128_aligned_seq_cst:
+; CHECK:    dmb ish
+; CHECK:    stp x0, x1, [x2]
+; CHECK:    dmb ish
     store atomic i128 %value, ptr %ptr seq_cst, align 16
     ret void
 }
@@ -330,3 +284,6 @@ define dso_local void @store_atomic_i128_unaligned_seq_cst(i128 %value, ptr %ptr
     store atomic i128 %value, ptr %ptr seq_cst, align 1
     ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; -O0: {{.*}}
+; -O1: {{.*}}

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll
index 838f4529c4dc7..5004db8d26560 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"
 ; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse2,+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse2,+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 
 define dso_local i8 @atomicrmw_xchg_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i8_aligned_monotonic:

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-rcpc3.ll
index 595bbaba4457a..2d56cf802c4f8 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-cmpxchg-rcpc3.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"
 ; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse2,+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse2,+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 
 define dso_local i8 @cmpxchg_i8_aligned_monotonic_monotonic(i8 %expected, i8 %new, ptr %ptr) {
 ; -O0-LABEL: cmpxchg_i8_aligned_monotonic_monotonic:

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-fence.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-fence.ll
index db00aca9110b6..a0928b38b6687 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-fence.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-fence.ll
@@ -10,8 +10,8 @@
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse2 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+outline-atomics -O0 | FileCheck %s --check-prefixes=CHECK,-O0
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+outline-atomics -O1 | FileCheck %s --check-prefixes=CHECK,-O1
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse2,+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse2,+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse128 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse128 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll
index 517baa6f2f8b6..a09b4c69755d5 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"
 ; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse2,+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse2,+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 
 define dso_local i8 @load_atomic_i8_aligned_unordered(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i8_aligned_unordered:
@@ -228,121 +228,59 @@ define dso_local i64 @load_atomic_i64_aligned_seq_cst_const(ptr readonly %ptr) {
 }
 
 define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_unordered:
-; -O0:    ldxp x1, x0, [x9]
-; -O0:    cmp x1, x10
-; -O0:    cmp x0, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x1, x0, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_unordered:
-; -O1:    ldxp x1, x0, [x8]
-; -O1:    stxp w9, x1, x0, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_unordered:
+; CHECK:    ldp x1, x0, [x0]
     %r = load atomic i128, ptr %ptr unordered, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_unordered_const:
-; -O0:    ldxp x1, x0, [x9]
-; -O0:    cmp x1, x10
-; -O0:    cmp x0, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x1, x0, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_unordered_const:
-; -O1:    ldxp x1, x0, [x8]
-; -O1:    stxp w9, x1, x0, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_unordered_const:
+; CHECK:    ldp x1, x0, [x0]
     %r = load atomic i128, ptr %ptr unordered, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x1, x0, [x9]
-; -O0:    cmp x1, x10
-; -O0:    cmp x0, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x1, x0, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_monotonic:
-; -O1:    ldxp x1, x0, [x8]
-; -O1:    stxp w9, x1, x0, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_monotonic:
+; CHECK:    ldp x1, x0, [x0]
     %r = load atomic i128, ptr %ptr monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_monotonic_const:
-; -O0:    ldxp x1, x0, [x9]
-; -O0:    cmp x1, x10
-; -O0:    cmp x0, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x1, x0, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_monotonic_const:
-; -O1:    ldxp x1, x0, [x8]
-; -O1:    stxp w9, x1, x0, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_monotonic_const:
+; CHECK:    ldp x1, x0, [x0]
     %r = load atomic i128, ptr %ptr monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_acquire:
-; -O0:    ldaxp x1, x0, [x9]
-; -O0:    cmp x1, x10
-; -O0:    cmp x0, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x1, x0, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_acquire:
-; -O1:    ldaxp x1, x0, [x8]
-; -O1:    stxp w9, x1, x0, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_acquire:
+; CHECK:    ldiapp x1, x0, [x0]
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_acquire_const:
-; -O0:    ldaxp x1, x0, [x9]
-; -O0:    cmp x1, x10
-; -O0:    cmp x0, x10
-; -O0:    stxp w8, x10, x10, [x9]
-; -O0:    stxp w8, x1, x0, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_acquire_const:
-; -O1:    ldaxp x1, x0, [x8]
-; -O1:    stxp w9, x1, x0, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_acquire_const:
+; CHECK:    ldiapp x1, x0, [x0]
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x1, x0, [x9]
-; -O0:    cmp x1, x10
-; -O0:    cmp x0, x10
-; -O0:    stlxp w8, x10, x10, [x9]
-; -O0:    stlxp w8, x1, x0, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_seq_cst:
-; -O1:    ldaxp x1, x0, [x8]
-; -O1:    stlxp w9, x1, x0, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_seq_cst:
+; CHECK:    ldp x1, x0, [x0]
+; CHECK:    dmb ish
     %r = load atomic i128, ptr %ptr seq_cst, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) {
-; -O0-LABEL: load_atomic_i128_aligned_seq_cst_const:
-; -O0:    ldaxp x1, x0, [x9]
-; -O0:    cmp x1, x10
-; -O0:    cmp x0, x10
-; -O0:    stlxp w8, x10, x10, [x9]
-; -O0:    stlxp w8, x1, x0, [x9]
-;
-; -O1-LABEL: load_atomic_i128_aligned_seq_cst_const:
-; -O1:    ldaxp x1, x0, [x8]
-; -O1:    stlxp w9, x1, x0, [x8]
+; CHECK-LABEL: load_atomic_i128_aligned_seq_cst_const:
+; CHECK:    ldp x1, x0, [x0]
+; CHECK:    dmb ish
     %r = load atomic i128, ptr %ptr seq_cst, align 16
     ret i128 %r
 }
@@ -626,3 +564,6 @@ define dso_local i128 @load_atomic_i128_unaligned_seq_cst_const(ptr readonly %pt
     %r = load atomic i128, ptr %ptr seq_cst, align 1
     ret i128 %r
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; -O0: {{.*}}
+; -O1: {{.*}}

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll
index 8e067fc74b56b..f8f8fe7cd6fa5 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"
 ; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse2,+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse2,+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 
 define dso_local void @store_atomic_i8_aligned_unordered(i8 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i8_aligned_unordered:
@@ -116,69 +116,31 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 }
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_unordered:
-; -O0:    ldxp x10, x12, [x9]
-; -O0:    cmp x10, x11
-; -O0:    cmp x12, x13
-; -O0:    stxp w8, x14, x15, [x9]
-; -O0:    stxp w8, x10, x12, [x9]
-; -O0:    subs x12, x12, x13
-; -O0:    ccmp x10, x11, #0, eq
-;
-; -O1-LABEL: store_atomic_i128_aligned_unordered:
-; -O1:    ldxp xzr, x8, [x2]
-; -O1:    stxp w8, x1, x0, [x2]
+; CHECK-LABEL: store_atomic_i128_aligned_unordered:
+; CHECK:    stp x1, x0, [x2]
     store atomic i128 %value, ptr %ptr unordered, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x10, x12, [x9]
-; -O0:    cmp x10, x11
-; -O0:    cmp x12, x13
-; -O0:    stxp w8, x14, x15, [x9]
-; -O0:    stxp w8, x10, x12, [x9]
-; -O0:    subs x12, x12, x13
-; -O0:    ccmp x10, x11, #0, eq
-;
-; -O1-LABEL: store_atomic_i128_aligned_monotonic:
-; -O1:    ldxp xzr, x8, [x2]
-; -O1:    stxp w8, x1, x0, [x2]
+; CHECK-LABEL: store_atomic_i128_aligned_monotonic:
+; CHECK:    stp x1, x0, [x2]
     store atomic i128 %value, ptr %ptr monotonic, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0:    ldxp x10, x12, [x9]
-; -O0:    cmp x10, x11
-; -O0:    cmp x12, x13
-; -O0:    stlxp w8, x14, x15, [x9]
-; -O0:    stlxp w8, x10, x12, [x9]
-; -O0:    subs x12, x12, x13
-; -O0:    ccmp x10, x11, #0, eq
-;
-; -O1-LABEL: store_atomic_i128_aligned_release:
-; -O1:    ldxp xzr, x8, [x2]
-; -O1:    stlxp w8, x1, x0, [x2]
+; CHECK-LABEL: store_atomic_i128_aligned_release:
+; CHECK:    stilp x1, x0, [x2]
     store atomic i128 %value, ptr %ptr release, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
-; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x10, x12, [x9]
-; -O0:    cmp x10, x11
-; -O0:    cmp x12, x13
-; -O0:    stlxp w8, x14, x15, [x9]
-; -O0:    stlxp w8, x10, x12, [x9]
-; -O0:    subs x12, x12, x13
-; -O0:    ccmp x10, x11, #0, eq
-;
-; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O1:    ldaxp xzr, x8, [x2]
-; -O1:    stlxp w8, x1, x0, [x2]
+; CHECK-LABEL: store_atomic_i128_aligned_seq_cst:
+; CHECK:    dmb ish
+; CHECK:    stp x1, x0, [x2]
+; CHECK:    dmb ish
     store atomic i128 %value, ptr %ptr seq_cst, align 16
     ret void
 }
@@ -322,3 +284,6 @@ define dso_local void @store_atomic_i128_unaligned_seq_cst(i128 %value, ptr %ptr
     store atomic i128 %value, ptr %ptr seq_cst, align 1
     ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; -O0: {{.*}}
+; -O1: {{.*}}

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll
index ec0300264b06a..9458345fef72e 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"
 ; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse2,+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse2,+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 
 define dso_local i8 @atomicrmw_xchg_i8_aligned_monotonic(ptr %ptr, i8 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i8_aligned_monotonic:

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-cmpxchg-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-cmpxchg-rcpc3.ll
index 804c9c7cb10e9..decae2be2cffd 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-cmpxchg-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-cmpxchg-rcpc3.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"
 ; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse2,+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse2,+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 
 define dso_local i8 @cmpxchg_i8_aligned_monotonic_monotonic(i8 %expected, i8 %new, ptr %ptr) {
 ; -O0-LABEL: cmpxchg_i8_aligned_monotonic_monotonic:

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-fence.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-fence.ll
index 7a9b5e22e46ae..82c1042c64a5f 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-fence.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-fence.ll
@@ -10,8 +10,8 @@
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse2 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+outline-atomics -O0 | FileCheck %s --check-prefixes=CHECK,-O0
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+outline-atomics -O1 | FileCheck %s --check-prefixes=CHECK,-O1
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse2,+rcpc3 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse2,+rcpc3 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse128 -O0 | FileCheck %s --check-prefixes=CHECK,-O0
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lse128 -O1 | FileCheck %s --check-prefixes=CHECK,-O1
 

diff --git a/llvm/test/CodeGen/AArch64/Atomics/generate-tests.py b/llvm/test/CodeGen/AArch64/Atomics/generate-tests.py
index 9104dab0cfda2..d0bc0a0435203 100755
--- a/llvm/test/CodeGen/AArch64/Atomics/generate-tests.py
+++ b/llvm/test/CodeGen/AArch64/Atomics/generate-tests.py
@@ -105,21 +105,24 @@ def __str__(self) -> str:
 
 
 class Feature(enum.Flag):
+    # Feature names in filenames are determined by the spelling here:
     v8a = enum.auto()
     v8_1a = enum.auto()  # -mattr=+v8.1a, mandatory FEAT_LOR, FEAT_LSE
     rcpc = enum.auto()  # FEAT_LRCPC
     lse2 = enum.auto()  # FEAT_LSE2
     outline_atomics = enum.auto()  # -moutline-atomics
-    rcpc3 = enum.auto()  # FEAT_LRCPC3
+    rcpc3 = enum.auto()  # FEAT_LSE2 + FEAT_LRCPC3
     lse128 = enum.auto()  # FEAT_LSE128
 
     @property
     def mattr(self):
         if self == Feature.outline_atomics:
-            return 'outline-atomics'
+            return '+outline-atomics'
         if self == Feature.v8_1a:
-            return 'v8.1a'
-        return self.name
+            return '+v8.1a'
+        if self == Feature.rcpc3:
+            return '+lse2,+rcpc3'
+        return '+' + self.name
 
 
 ATOMICRMW_OPS = [
@@ -230,7 +233,7 @@ def header(f, triple, features, filter_args: str):
         for OptFlag in ['-O0', '-O1']:
             f.write(' '.join([
                 ';', 'RUN:', 'llc', '%s', '-o', '-', '-verify-machineinstrs',
-                f'-mtriple={triple}', f'-mattr=+{feat.mattr}', OptFlag, '|',
+                f'-mtriple={triple}', f'-mattr={feat.mattr}', OptFlag, '|',
                 'FileCheck', '%s', f'--check-prefixes=CHECK,{OptFlag}\n'
             ]))
 


        

