[llvm] 34c098b - [ARM] Prevent spilling between ldrex/strex pairs
Tomas Matheson via llvm-commits
llvm-commits at lists.llvm.org
Wed May 12 01:43:50 PDT 2021
Author: Tomas Matheson
Date: 2021-05-12T09:43:21+01:00
New Revision: 34c098b780a27a90b5614ea3b949b9269835f2a5
URL: https://github.com/llvm/llvm-project/commit/34c098b780a27a90b5614ea3b949b9269835f2a5
DIFF: https://github.com/llvm/llvm-project/commit/34c098b780a27a90b5614ea3b949b9269835f2a5.diff
LOG: [ARM] Prevent spilling between ldrex/strex pairs
Based on the equivalent change for AArch64: 4751cadcca45984d7671e594ce95aed8fe030bf1
At -O0, the fast register allocator may insert spills between the ldrex and
strex instructions that AtomicExpandPass emits when expanding atomicrmw
instructions into LL/SC loops; a spill near the target address can clear the
exclusive monitor, so the loop never succeeds. To avoid this, expand atomicrmw
to cmpxchg loops instead, since the cmpxchg pseudos are not expanded until
after register allocation.
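As an illustration (not part of the commit), this is roughly the IR shape
AtomicExpandPass produces once shouldExpandAtomicRMWInIR returns
AtomicExpansionKind::CmpXChg at -O0; block and value names are made up:

  define i8 @rmw_add(i8* %p) {
  entry:
    ; was: %old = atomicrmw add i8* %p, i8 1 monotonic
    %init = load i8, i8* %p, align 1
    br label %loop

  loop:                                             ; CAS loop
    %loaded = phi i8 [ %init, %entry ], [ %old, %loop ]
    %new = add i8 %loaded, 1
    ; selected as a CMP_SWAP_8/tCMP_SWAP_8 pseudo, expanded only after regalloc
    %pair = cmpxchg i8* %p, i8 %loaded, i8 %new monotonic monotonic
    %old = extractvalue { i8, i1 } %pair, 0
    %ok = extractvalue { i8, i1 } %pair, 1
    br i1 %ok, label %done, label %loop

  done:
    ret i8 %old                                     ; value before the swap
  }

Because the ldrex/strex pair now exists only inside the post-RA expansion of
the cmpxchg pseudo, fast-regalloc has no opportunity to spill between them.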
This required a tweak to ARMExpandPseudo::ExpandCMP_SWAP to use the 4-byte
encoding of UXT, since the pseudo instruction may be allocated a high register
(R8-R15), which the 2-byte encoding doesn't support. However, the 4-byte
encodings are not present on ARMv8-M Baseline. To keep that target working,
two new Thumb pseudos, tCMP_SWAP_8 and tCMP_SWAP_16, are added; they restrict
the desired-value operand to a low register (tGPR) so the 2-byte encoding
always suffices.
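For reference, a sketch of what ExpandCMP_SWAP emits for the byte-sized Thumb
pseudo after register allocation (register choices are illustrative and this
follows the structure of the expansion, not verbatim output):

        uxtb    r1, r1          @ tUXTB: 2-byte encoding, so the desired value
                                @ must be in a low register, hence tGPR
  .Lloadcmp:
        ldrexb  r0, [r2]        @ t2LDREXB: load current value exclusively
        cmp     r0, r1          @ does it match the desired (expected) value?
        bne     .Ldone          @ no: fail the cmpxchg
  .Lstore:
        strexb  r3, r4, [r2]    @ t2STREXB: try to store the new value
        cmp     r3, #0          @ status: 0 means the exclusive store succeeded
        bne     .Lloadcmp       @ exclusive store failed: retry
  .Ldone:

In ARM mode the plain CMP_SWAP_8/CMP_SWAP_16 pseudos keep using UXTB/UXTH,
which are always 4-byte encodings and carry no such register-class
restriction.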
The previously committed attempt in D101164 had to be reverted due to runtime
failures in the test suites. Rather than spend time fixing that
implementation (adding yet another implementation of atomic operations and
more divergence between backends), I have chosen to follow the approach taken
in D101163.
Differential Revision: https://reviews.llvm.org/D101898
Depends on D101912
Added:
Modified:
llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/lib/Target/ARM/ARMInstrThumb.td
llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor_ints.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 5fe8e96fa2ec3..5764ddd596889 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1566,6 +1566,15 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
Register DesiredReg = MI.getOperand(3).getReg();
Register NewReg = MI.getOperand(4).getReg();
+ if (IsThumb) {
+ assert(STI->hasV8MBaselineOps() &&
+ "CMP_SWAP not expected to be custom expanded for Thumb1");
+ assert((UxtOp == 0 || UxtOp == ARM::tUXTB || UxtOp == ARM::tUXTH) &&
+ "ARMv8-M.baseline does not have t2UXTB/t2UXTH");
+ assert(ARM::tGPRRegClass.contains(DesiredReg) &&
+ "DesiredReg used for UXT op must be tGPR");
+ }
+
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -2779,20 +2788,23 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true;
+ case ARM::tCMP_SWAP_8:
+ assert(STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB, ARM::tUXTB,
+ NextMBBI);
+ case ARM::tCMP_SWAP_16:
+ assert(STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH, ARM::tUXTH,
+ NextMBBI);
+
case ARM::CMP_SWAP_8:
- if (STI->isThumb())
- return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB,
- ARM::tUXTB, NextMBBI);
- else
- return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB,
- ARM::UXTB, NextMBBI);
+ assert(!STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB, ARM::UXTB,
+ NextMBBI);
case ARM::CMP_SWAP_16:
- if (STI->isThumb())
- return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH,
- ARM::tUXTH, NextMBBI);
- else
- return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH,
- ARM::UXTH, NextMBBI);
+ assert(!STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH, ARM::UXTH,
+ NextMBBI);
case ARM::CMP_SWAP_32:
if (STI->isThumb())
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0,
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6149aec451c4d..5024f47459a3f 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -3299,9 +3299,9 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
unsigned Opcode;
EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
if (MemTy == MVT::i8)
- Opcode = ARM::CMP_SWAP_8;
+ Opcode = Subtarget->isThumb() ? ARM::tCMP_SWAP_8 : ARM::CMP_SWAP_8;
else if (MemTy == MVT::i16)
- Opcode = ARM::CMP_SWAP_16;
+ Opcode = Subtarget->isThumb() ? ARM::tCMP_SWAP_16 : ARM::CMP_SWAP_16;
else if (MemTy == MVT::i32)
Opcode = ARM::CMP_SWAP_32;
else
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index cae4b9babecb9..f9f0b57f4668e 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -19359,6 +19359,14 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (AI->isFloatingPointOperation())
return AtomicExpansionKind::CmpXChg;
+ // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+ // implement atomicrmw without spilling. If the target address is also on the
+ // stack and close enough to the spill slot, this can lead to a situation
+ // where the monitor always gets cleared and the atomic operation can never
+ // succeed. So at -O0 lower this operation to a CAS loop.
+ if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+ return AtomicExpansionKind::CmpXChg;
+
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index 64d4dc0b112ab..ef07b2839bc93 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1766,3 +1766,21 @@ def : tInstAlias<"asr${s}${p} $Rdm, $imm",
def tLDRConstPool
: tAsmPseudo<"ldr${p} $Rt, $immediate",
(ins tGPR:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
+
+//===----------------------------------
+// Atomic cmpxchg for -O0
+//===----------------------------------
+
+// See ARMInstrInfo.td. These two thumb specific pseudos are required to
+// restrict the register class for the UXTB/UXTH ops used in the expansion.
+
+let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
+ mayLoad = 1, mayStore = 1 in {
+def tCMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
+ (ins GPR:$addr, tGPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+
+def tCMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
+ (ins GPR:$addr, tGPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
+}
diff --git a/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor_ints.ll b/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor_ints.ll
index 12b270e055e7d..277843cb76e71 100644
--- a/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor_ints.ll
+++ b/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor_ints.ll
@@ -16,7 +16,7 @@
define i8 @test_xchg_i8() {
; COMMON-LABEL: test_xchg_i8:
; EXPAND32: ldrexb
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexb
; THUMB1: bl __sync_lock_test_and_set_1
entry:
@@ -26,7 +26,7 @@ entry:
define i8 @test_add_i8() {
; COMMON-LABEL: test_add_i8:
; EXPAND32: ldrexb
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexb
; THUMB1: bl __sync_fetch_and_add_1
entry:
@@ -36,7 +36,7 @@ entry:
define i8 @test_sub_i8() {
; COMMON-LABEL: test_sub_i8:
; EXPAND32: ldrexb
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexb
; THUMB1: bl __sync_fetch_and_sub_1
entry:
@@ -46,7 +46,7 @@ entry:
define i8 @test_and_i8() {
; COMMON-LABEL: test_and_i8:
; EXPAND32: ldrexb
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexb
; THUMB1: bl __sync_fetch_and_and_1
entry:
@@ -56,7 +56,7 @@ entry:
define i8 @test_nand_i8() {
; COMMON-LABEL: test_nand_i8:
; EXPAND32: ldrexb
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexb
; THUMB1: bl __sync_fetch_and_nand_1
entry:
@@ -66,7 +66,7 @@ entry:
define i8 @test_or_i8() {
; COMMON-LABEL: test_or_i8:
; EXPAND32: ldrexb
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexb
; THUMB1: bl __sync_fetch_and_or_1
entry:
@@ -76,7 +76,7 @@ entry:
define i8 @test_xor_i8() {
; COMMON-LABEL: test_xor_i8:
; EXPAND32: ldrexb
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexb
; THUMB1: bl __sync_fetch_and_xor_1
entry:
@@ -86,7 +86,7 @@ entry:
define i8 @test_max_i8() {
; COMMON-LABEL: test_max_i8:
; EXPAND32: ldrexb
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexb
; THUMB1: bl __sync_fetch_and_max_1
entry:
@@ -96,7 +96,7 @@ entry:
define i8 @test_min_i8() {
; COMMON-LABEL: test_min_i8:
; EXPAND32: ldrexb
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexb
; THUMB1: bl __sync_fetch_and_min_1
entry:
@@ -106,7 +106,7 @@ entry:
define i8 @test_umax_i8() {
; COMMON-LABEL: test_umax_i8:
; EXPAND32: ldrexb
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexb
; THUMB1: bl __sync_fetch_and_umax_1
entry:
@@ -116,7 +116,7 @@ entry:
define i8 @test_umin_i8() {
; COMMON-LABEL: test_umin_i8:
; EXPAND32: ldrexb
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexb
; THUMB1: bl __sync_fetch_and_umin_1
entry:
@@ -128,7 +128,7 @@ entry:
define i16 @test_xchg_i16() {
; COMMON-LABEL: test_xchg_i16:
; EXPAND32: ldrexh
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexh
; THUMB1: bl __sync_lock_test_and_set_2
entry:
@@ -138,7 +138,7 @@ entry:
define i16 @test_add_i16() {
; COMMON-LABEL: test_add_i16:
; EXPAND32: ldrexh
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexh
; THUMB1: bl __sync_fetch_and_add_2
entry:
@@ -148,7 +148,7 @@ entry:
define i16 @test_sub_i16() {
; COMMON-LABEL: test_sub_i16:
; EXPAND32: ldrexh
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexh
; THUMB1: bl __sync_fetch_and_sub_2
entry:
@@ -158,7 +158,7 @@ entry:
define i16 @test_and_i16() {
; COMMON-LABEL: test_and_i16:
; EXPAND32: ldrexh
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexh
; THUMB1: bl __sync_fetch_and_and_2
entry:
@@ -168,7 +168,7 @@ entry:
define i16 @test_nand_i16() {
; COMMON-LABEL: test_nand_i16:
; EXPAND32: ldrexh
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexh
; THUMB1: bl __sync_fetch_and_nand_2
entry:
@@ -178,7 +178,7 @@ entry:
define i16 @test_or_i16() {
; COMMON-LABEL: test_or_i16:
; EXPAND32: ldrexh
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexh
; THUMB1: bl __sync_fetch_and_or_2
entry:
@@ -188,7 +188,7 @@ entry:
define i16 @test_xor_i16() {
; COMMON-LABEL: test_xor_i16:
; EXPAND32: ldrexh
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexh
; THUMB1: bl __sync_fetch_and_xor_2
entry:
@@ -198,7 +198,7 @@ entry:
define i16 @test_max_i16() {
; COMMON-LABEL: test_max_i16:
; EXPAND32: ldrexh
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexh
; THUMB1: bl __sync_fetch_and_max_2
entry:
@@ -208,7 +208,7 @@ entry:
define i16 @test_min_i16() {
; COMMON-LABEL: test_min_i16:
; EXPAND32: ldrexh
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexh
; THUMB1: bl __sync_fetch_and_min_2
entry:
@@ -218,7 +218,7 @@ entry:
define i16 @test_umax_i16() {
; COMMON-LABEL: test_umax_i16:
; EXPAND32: ldrexh
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexh
; THUMB1: bl __sync_fetch_and_umax_2
entry:
@@ -228,7 +228,7 @@ entry:
define i16 @test_umin_i16() {
; COMMON-LABEL: test_umin_i16:
; EXPAND32: ldrexh
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strexh
; THUMB1: bl __sync_fetch_and_umin_2
entry:
@@ -240,7 +240,7 @@ entry:
define i32 @test_xchg_i32() {
; COMMON-LABEL: test_xchg_i32:
; EXPAND32: ldrex
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strex
; THUMB1: bl __sync_lock_test_and_set_4
entry:
@@ -250,7 +250,7 @@ entry:
define i32 @test_add_i32() {
; COMMON-LABEL: test_add_i32:
; EXPAND32: ldrex
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strex
; THUMB1: bl __sync_fetch_and_add_4
entry:
@@ -260,7 +260,7 @@ entry:
define i32 @test_sub_i32() {
; COMMON-LABEL: test_sub_i32:
; EXPAND32: ldrex
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strex
; THUMB1: bl __sync_fetch_and_sub_4
entry:
@@ -270,7 +270,7 @@ entry:
define i32 @test_and_i32() {
; COMMON-LABEL: test_and_i32:
; EXPAND32: ldrex
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strex
; THUMB1: bl __sync_fetch_and_and_4
entry:
@@ -280,7 +280,7 @@ entry:
define i32 @test_nand_i32() {
; COMMON-LABEL: test_nand_i32:
; EXPAND32: ldrex
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strex
; THUMB1: bl __sync_fetch_and_nand_4
entry:
@@ -290,7 +290,7 @@ entry:
define i32 @test_or_i32() {
; COMMON-LABEL: test_or_i32:
; EXPAND32: ldrex
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strex
; THUMB1: bl __sync_fetch_and_or_4
entry:
@@ -300,7 +300,7 @@ entry:
define i32 @test_xor_i32() {
; COMMON-LABEL: test_xor_i32:
; EXPAND32: ldrex
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strex
; THUMB1: bl __sync_fetch_and_xor_4
entry:
@@ -310,7 +310,7 @@ entry:
define i32 @test_max_i32() {
; COMMON-LABEL: test_max_i32:
; EXPAND32: ldrex
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strex
; THUMB1: bl __sync_fetch_and_max_4
entry:
@@ -320,7 +320,7 @@ entry:
define i32 @test_min_i32() {
; COMMON-LABEL: test_min_i32:
; EXPAND32: ldrex
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strex
; THUMB1: bl __sync_fetch_and_min_4
@@ -331,7 +331,7 @@ entry:
define i32 @test_umax_i32() {
; COMMON-LABEL: test_umax_i32:
; EXPAND32: ldrex
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strex
; THUMB1: bl __sync_fetch_and_umax_4
entry:
@@ -341,7 +341,7 @@ entry:
define i32 @test_umin_i32() {
; COMMON-LABEL: test_umin_i32:
; EXPAND32: ldrex
-; EXPAND32: str
+; EXPAND32-NOT: str
; EXPAND32: strex
; THUMB1: bl __sync_fetch_and_umin_4
entry:
@@ -352,10 +352,10 @@ entry:
define i64 @test_xchg_i64() {
; COMMON-LABEL: test_xchg_i64:
; EXPAND64: ldrexd
-; EXPAND64: str
+; EXPAND64-NOT: str
; EXPAND64: strexd
; THUMB1: bl __sync_lock_test_and_set_8
-; BASELINE64: bl __sync_lock_test_and_set_8
+; BASELINE64: bl __sync_val_compare_and_swap_8
entry:
%0 = atomicrmw xchg i64* @atomic_i64, i64 1 monotonic
ret i64 %0
@@ -363,10 +363,10 @@ entry:
define i64 @test_add_i64() {
; COMMON-LABEL: test_add_i64:
; EXPAND64: ldrexd
-; EXPAND64: str
+; EXPAND64-NOT: str
; EXPAND64: strexd
; THUMB1: bl __sync_fetch_and_add_8
-; BASELINE64: bl __sync_fetch_and_add_8
+; BASELINE64: bl __sync_val_compare_and_swap_8
entry:
%0 = atomicrmw add i64* @atomic_i64, i64 1 monotonic
ret i64 %0
@@ -374,10 +374,10 @@ entry:
define i64 @test_sub_i64() {
; COMMON-LABEL: test_sub_i64:
; EXPAND64: ldrexd
-; EXPAND64: str
+; EXPAND64-NOT: str
; EXPAND64: strexd
; THUMB1: bl __sync_fetch_and_sub_8
-; BASELINE64: bl __sync_fetch_and_sub_8
+; BASELINE64: bl __sync_val_compare_and_swap_8
entry:
%0 = atomicrmw sub i64* @atomic_i64, i64 1 monotonic
ret i64 %0
@@ -385,10 +385,10 @@ entry:
define i64 @test_and_i64() {
; COMMON-LABEL: test_and_i64:
; EXPAND64: ldrexd
-; EXPAND64: str
+; EXPAND64-NOT: str
; EXPAND64: strexd
; THUMB1: bl __sync_fetch_and_and_8
-; BASELINE64: bl __sync_fetch_and_and_8
+; BASELINE64: bl __sync_val_compare_and_swap_8
entry:
%0 = atomicrmw and i64* @atomic_i64, i64 1 monotonic
ret i64 %0
@@ -396,10 +396,10 @@ entry:
define i64 @test_nand_i64() {
; COMMON-LABEL: test_nand_i64:
; EXPAND64: ldrexd
-; EXPAND64: str
+; EXPAND64-NOT: str
; EXPAND64: strexd
; THUMB1: bl __sync_fetch_and_nand_8
-; BASELINE64: bl __sync_fetch_and_nand_8
+; BASELINE64: bl __sync_val_compare_and_swap_8
entry:
%0 = atomicrmw nand i64* @atomic_i64, i64 1 monotonic
ret i64 %0
@@ -407,10 +407,10 @@ entry:
define i64 @test_or_i64() {
; COMMON-LABEL: test_or_i64:
; EXPAND64: ldrexd
-; EXPAND64: str
+; EXPAND64-NOT: str
; EXPAND64: strexd
; THUMB1: bl __sync_fetch_and_or_8
-; BASELINE64: bl __sync_fetch_and_or_8
+; BASELINE64: bl __sync_val_compare_and_swap_8
entry:
%0 = atomicrmw or i64* @atomic_i64, i64 1 monotonic
ret i64 %0
@@ -418,10 +418,10 @@ entry:
define i64 @test_xor_i64() {
; COMMON-LABEL: test_xor_i64:
; EXPAND64: ldrexd
-; EXPAND64: str
+; EXPAND64-NOT: str
; EXPAND64: strexd
; THUMB1: bl __sync_fetch_and_xor_8
-; BASELINE64: bl __sync_fetch_and_xor_8
+; BASELINE64: bl __sync_val_compare_and_swap_8
entry:
%0 = atomicrmw xor i64* @atomic_i64, i64 1 monotonic
ret i64 %0
@@ -430,10 +430,10 @@ entry:
define i64 @test_max_i64() {
; COMMON-LABEL: test_max_i64:
; EXPAND64: ldrexd
-; EXPAND64: str
+; EXPAND64-NOT: str
; EXPAND64: strexd
; THUMB1: bl __sync_fetch_and_max_8
-; BASELINE64: bl __sync_fetch_and_max_8
+; BASELINE64: bl __sync_val_compare_and_swap_8
entry:
%0 = atomicrmw max i64* @atomic_i64, i64 1 monotonic
ret i64 %0
@@ -441,10 +441,10 @@ entry:
define i64 @test_min_i64() {
; COMMON-LABEL: test_min_i64:
; EXPAND64: ldrexd
-; EXPAND64: str
+; EXPAND64-NOT: str
; EXPAND64: strexd
; THUMB1: bl __sync_fetch_and_min_8
-; BASELINE64: bl __sync_fetch_and_min_8
+; BASELINE64: bl __sync_val_compare_and_swap_8
entry:
%0 = atomicrmw min i64* @atomic_i64, i64 1 monotonic
ret i64 %0
@@ -452,10 +452,10 @@ entry:
define i64 @test_umax_i64() {
; COMMON-LABEL: test_umax_i64:
; EXPAND64: ldrexd
-; EXPAND64: str
+; EXPAND64-NOT: str
; EXPAND64: strexd
; THUMB1: bl __sync_fetch_and_umax_8
-; BASELINE64: bl __sync_fetch_and_umax_8
+; BASELINE64: bl __sync_val_compare_and_swap_8
entry:
%0 = atomicrmw umax i64* @atomic_i64, i64 1 monotonic
ret i64 %0
@@ -463,10 +463,10 @@ entry:
define i64 @test_umin_i64() {
; COMMON-LABEL: test_umin_i64:
; EXPAND64: ldrexd
-; EXPAND64: str
+; EXPAND64-NOT: str
; EXPAND64: strexd
; THUMB1: bl __sync_fetch_and_umin_8
-; BASELINE64: bl __sync_fetch_and_umin_8
+; BASELINE64: bl __sync_val_compare_and_swap_8
entry:
%0 = atomicrmw umin i64* @atomic_i64, i64 1 monotonic
ret i64 %0