[llvm] Reland "Revert "AtomicExpand: Allow incrementally legalizing atomicrmw"" (PR #106793)

Vitaly Buka via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 30 13:54:51 PDT 2024


https://github.com/vitalybuka updated https://github.com/llvm/llvm-project/pull/106793

>From 09d9461e69293b86e96899707ba29b1a357e75cd Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at gmail.com>
Date: Fri, 30 Aug 2024 13:52:37 -0700
Subject: [PATCH 1/3] =?UTF-8?q?Revert=20"Revert=20"AtomicExpand:=20Allow?=
 =?UTF-8?q?=20incrementally=20legalizing=20atomicrmw"=20(#106=E2=80=A6"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 982d2445f2a5bad96c501ff23923648ffa094ef2.
---
 llvm/lib/CodeGen/AtomicExpandPass.cpp       |  35 +-
 llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 373 +++++++++++---------
 llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 373 +++++++++++---------
 llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 373 +++++++++++---------
 llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 373 +++++++++++---------
 5 files changed, 836 insertions(+), 691 deletions(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 39a705599f90cc..b9732e816ea7e6 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -351,17 +351,30 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {
 
   bool MadeChange = false;
 
-  SmallVector<Instruction *, 1> AtomicInsts;
-
-  // Changing control-flow while iterating through it is a bad idea, so gather a
-  // list of all atomic instructions before we start.
-  for (Instruction &I : instructions(F))
-    if (I.isAtomic() && !isa<FenceInst>(&I))
-      AtomicInsts.push_back(&I);
-
-  for (auto *I : AtomicInsts) {
-    if (processAtomicInstr(I))
-      MadeChange = true;
+  for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE;) {
+    BasicBlock *BB = &*BBI;
+    ++BBI;
+
+    BasicBlock::iterator Next;
+
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+         I = Next) {
+      Instruction &Inst = *I;
+      Next = std::next(I);
+
+      if (processAtomicInstr(&Inst)) {
+        MadeChange = true;
+
+        // Detect control flow change and resume iteration from the original
+        // block to inspect any newly inserted blocks. This allows incremental
+        // legalization of atomicrmw and cmpxchg.
+        if (BB != Next->getParent()) {
+          BBI = BB->getIterator();
+          BBE = F.end();
+          break;
+        }
+      }
+    }
   }
 
   return MadeChange;
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
index 0d230bb9dcc6e9..ed9c1b037d0cc7 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
@@ -43,46 +43,49 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 {
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2:
 ; SOFTFP-NOLSE:       // %bb.0:
-; SOFTFP-NOLSE-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB0_2
-; SOFTFP-NOLSE-NEXT:  .LBB0_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB0_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB0_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB0_6
 ; SOFTFP-NOLSE-NEXT:  .LBB0_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB0_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
-; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    and w23, w20, #0xffff
 ; SOFTFP-NOLSE-NEXT:    mov w22, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    and w0, w20, #0xffff
+; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl __addsf3
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:  .LBB0_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB0_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB0_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w22, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB0_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB0_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB0_3
-; SOFTFP-NOLSE-NEXT:    b .LBB0_1
-; SOFTFP-NOLSE-NEXT:  .LBB0_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB0_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB0_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB0_2
+; SOFTFP-NOLSE-NEXT:  .LBB0_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; SOFTFP-NOLSE-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2
   ret half %res
@@ -128,46 +131,49 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4:
 ; SOFTFP-NOLSE:       // %bb.0:
-; SOFTFP-NOLSE-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB1_2
-; SOFTFP-NOLSE-NEXT:  .LBB1_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB1_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB1_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB1_6
 ; SOFTFP-NOLSE-NEXT:  .LBB1_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB1_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
-; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    and w23, w20, #0xffff
 ; SOFTFP-NOLSE-NEXT:    mov w22, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    and w0, w20, #0xffff
+; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl __addsf3
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:  .LBB1_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB1_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB1_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w22, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB1_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB1_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB1_3
-; SOFTFP-NOLSE-NEXT:    b .LBB1_1
-; SOFTFP-NOLSE-NEXT:  .LBB1_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB1_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB1_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB1_2
+; SOFTFP-NOLSE-NEXT:  .LBB1_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; SOFTFP-NOLSE-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4
   ret half %res
@@ -232,36 +238,40 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
-; SOFTFP-NOLSE-NEXT:    lsl w21, w1, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    lsl w20, w1, #16
 ; SOFTFP-NOLSE-NEXT:    b .LBB2_2
-; SOFTFP-NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB2_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB2_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB2_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB2_6
 ; SOFTFP-NOLSE-NEXT:  .LBB2_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB2_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    lsl w0, w20, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    lsl w0, w0, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
 ; SOFTFP-NOLSE-NEXT:    bl __addsf3
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:  .LBB2_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB2_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB2_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB2_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB2_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB2_3
-; SOFTFP-NOLSE-NEXT:    b .LBB2_1
-; SOFTFP-NOLSE-NEXT:  .LBB2_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB2_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB2_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB2_2
+; SOFTFP-NOLSE-NEXT:  .LBB2_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2
@@ -327,36 +337,40 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
-; SOFTFP-NOLSE-NEXT:    lsl w21, w1, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    lsl w20, w1, #16
 ; SOFTFP-NOLSE-NEXT:    b .LBB3_2
-; SOFTFP-NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB3_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB3_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB3_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB3_6
 ; SOFTFP-NOLSE-NEXT:  .LBB3_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB3_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    lsl w0, w20, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    lsl w0, w0, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
 ; SOFTFP-NOLSE-NEXT:    bl __addsf3
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:  .LBB3_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB3_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB3_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB3_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB3_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB3_3
-; SOFTFP-NOLSE-NEXT:    b .LBB3_1
-; SOFTFP-NOLSE-NEXT:  .LBB3_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB3_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB3_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB3_2
+; SOFTFP-NOLSE-NEXT:  .LBB3_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4
@@ -399,35 +413,38 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldr w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
+; SOFTFP-NOLSE-NEXT:    ldr w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB4_2
-; SOFTFP-NOLSE-NEXT:  .LBB4_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB4_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB4_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB4_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB4_6
 ; SOFTFP-NOLSE-NEXT:  .LBB4_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB4_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
 ; SOFTFP-NOLSE-NEXT:    bl __addsf3
-; SOFTFP-NOLSE-NEXT:  .LBB4_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB4_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB4_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20
+; SOFTFP-NOLSE-NEXT:    ldaxr w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB4_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB4_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB4_3
-; SOFTFP-NOLSE-NEXT:    b .LBB4_1
-; SOFTFP-NOLSE-NEXT:  .LBB4_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxr w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB4_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB4_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB4_2
+; SOFTFP-NOLSE-NEXT:  .LBB4_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4
@@ -469,36 +486,40 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8:
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    ldr x21, [x0]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldr x20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
-; SOFTFP-NOLSE-NEXT:    mov x21, x1
+; SOFTFP-NOLSE-NEXT:    mov x20, x1
 ; SOFTFP-NOLSE-NEXT:    b .LBB5_2
-; SOFTFP-NOLSE-NEXT:  .LBB5_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB5_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB5_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp x8, x20
-; SOFTFP-NOLSE-NEXT:    mov x20, x8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB5_5
+; SOFTFP-NOLSE-NEXT:    mov w9, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    mov x21, x8
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB5_6
 ; SOFTFP-NOLSE-NEXT:  .LBB5_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB5_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov x0, x20
-; SOFTFP-NOLSE-NEXT:    mov x1, x21
+; SOFTFP-NOLSE-NEXT:    mov x0, x21
+; SOFTFP-NOLSE-NEXT:    mov x1, x20
 ; SOFTFP-NOLSE-NEXT:    bl __adddf3
-; SOFTFP-NOLSE-NEXT:  .LBB5_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB5_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB5_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
 ; SOFTFP-NOLSE-NEXT:    ldaxr x8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp x8, x20
+; SOFTFP-NOLSE-NEXT:    cmp x8, x21
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB5_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB5_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, x0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB5_3
-; SOFTFP-NOLSE-NEXT:    b .LBB5_1
-; SOFTFP-NOLSE-NEXT:  .LBB5_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov x0, x20
+; SOFTFP-NOLSE-NEXT:    stlxr w9, x0, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB5_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB5_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w9, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    mov x21, x8
+; SOFTFP-NOLSE-NEXT:    cbz w9, .LBB5_2
+; SOFTFP-NOLSE-NEXT:  .LBB5_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    mov x0, x21
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8
@@ -687,18 +708,18 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half>
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    ldrh w23, [x0, #2]
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w21, [x0]
-; SOFTFP-NOLSE-NEXT:    mov w22, w1
+; SOFTFP-NOLSE-NEXT:    ldrh w22, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w19, w2
 ; SOFTFP-NOLSE-NEXT:    mov x20, x0
 ; SOFTFP-NOLSE-NEXT:    b .LBB7_2
-; SOFTFP-NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB7_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB7_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr w23, w8, #16
-; SOFTFP-NOLSE-NEXT:    cmp w8, w21
-; SOFTFP-NOLSE-NEXT:    mov w21, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB7_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr w23, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB7_6
 ; SOFTFP-NOLSE-NEXT:  .LBB7_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB7_3 Depth 2
@@ -711,29 +732,33 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half>
 ; SOFTFP-NOLSE-NEXT:    bl __addsf3
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
+; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w25, w0
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w1, w25
 ; SOFTFP-NOLSE-NEXT:    bl __addsf3
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:    bfi w21, w23, #16, #16
+; SOFTFP-NOLSE-NEXT:    mov w8, w22
 ; SOFTFP-NOLSE-NEXT:    bfi w0, w24, #16, #16
-; SOFTFP-NOLSE-NEXT:  .LBB7_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    bfi w8, w23, #16, #16
+; SOFTFP-NOLSE-NEXT:  .LBB7_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB7_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w8, [x20]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w21
+; SOFTFP-NOLSE-NEXT:    ldaxr w22, [x20]
+; SOFTFP-NOLSE-NEXT:    cmp w22, w8
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB7_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB7_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x20]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB7_3
-; SOFTFP-NOLSE-NEXT:    b .LBB7_1
-; SOFTFP-NOLSE-NEXT:  .LBB7_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w21
+; SOFTFP-NOLSE-NEXT:    stlxr w9, w0, [x20]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB7_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB7_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr w23, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB7_2
+; SOFTFP-NOLSE-NEXT:  .LBB7_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    mov w1, w23
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
@@ -799,17 +824,18 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    ldrh w1, [x0, #2]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w21, [x0]
+; SOFTFP-NOLSE-NEXT:    ldrh w22, [x0]
 ; SOFTFP-NOLSE-NEXT:    lsl w20, w2, #16
-; SOFTFP-NOLSE-NEXT:    lsl w22, w8, #16
+; SOFTFP-NOLSE-NEXT:    lsl w21, w8, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    b .LBB8_2
-; SOFTFP-NOLSE-NEXT:  .LBB8_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB8_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB8_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr w1, w21, #16
-; SOFTFP-NOLSE-NEXT:    cmp w21, w23
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB8_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr w1, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB8_6
 ; SOFTFP-NOLSE-NEXT:  .LBB8_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB8_3 Depth 2
@@ -819,25 +845,28 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; SOFTFP-NOLSE-NEXT:    bl __addsf3
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    lsl w0, w21, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    lsl w0, w22, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl __addsf3
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:    bfxil w23, w21, #0, #16
+; SOFTFP-NOLSE-NEXT:    bfxil w23, w22, #0, #16
 ; SOFTFP-NOLSE-NEXT:    bfi w0, w24, #16, #16
-; SOFTFP-NOLSE-NEXT:  .LBB8_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB8_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB8_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w21, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w21, w23
+; SOFTFP-NOLSE-NEXT:    ldaxr w22, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w22, w23
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB8_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB8_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB8_3
-; SOFTFP-NOLSE-NEXT:    b .LBB8_1
-; SOFTFP-NOLSE-NEXT:  .LBB8_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w21
+; SOFTFP-NOLSE-NEXT:    stlxr w8, w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB8_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB8_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr w1, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB8_2
+; SOFTFP-NOLSE-NEXT:  .LBB8_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
@@ -885,45 +914,49 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w21, w1
-; SOFTFP-NOLSE-NEXT:    ldp w23, w22, [x0]
+; SOFTFP-NOLSE-NEXT:    ldp w22, w23, [x0]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w19, w2
 ; SOFTFP-NOLSE-NEXT:    mov x20, x0
 ; SOFTFP-NOLSE-NEXT:    b .LBB9_2
-; SOFTFP-NOLSE-NEXT:  .LBB9_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB9_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB9_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr x22, x23, #32
-; SOFTFP-NOLSE-NEXT:    cmp x23, x8
-; SOFTFP-NOLSE-NEXT:    // kill: def $w22 killed $w22 killed $x22 def $x22
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB9_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr x23, x22, #32
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB9_6
 ; SOFTFP-NOLSE-NEXT:  .LBB9_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB9_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov w0, w22
+; SOFTFP-NOLSE-NEXT:    mov w0, w23
 ; SOFTFP-NOLSE-NEXT:    mov w1, w19
 ; SOFTFP-NOLSE-NEXT:    bl __addsf3
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl __addsf3
-; SOFTFP-NOLSE-NEXT:    mov w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w9, w0
-; SOFTFP-NOLSE-NEXT:    orr x9, x9, x24, lsl #32
-; SOFTFP-NOLSE-NEXT:    orr x8, x8, x22, lsl #32
-; SOFTFP-NOLSE-NEXT:  .LBB9_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:    mov w9, w22
+; SOFTFP-NOLSE-NEXT:    // kill: def $w23 killed $w23 killed $x23 def $x23
+; SOFTFP-NOLSE-NEXT:    orr x8, x8, x24, lsl #32
+; SOFTFP-NOLSE-NEXT:    orr x9, x9, x23, lsl #32
+; SOFTFP-NOLSE-NEXT:  .LBB9_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB9_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr x23, [x20]
-; SOFTFP-NOLSE-NEXT:    cmp x23, x8
+; SOFTFP-NOLSE-NEXT:    ldaxr x22, [x20]
+; SOFTFP-NOLSE-NEXT:    cmp x22, x9
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB9_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB9_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, x9, [x20]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB9_3
-; SOFTFP-NOLSE-NEXT:    b .LBB9_1
-; SOFTFP-NOLSE-NEXT:  .LBB9_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    stlxr w10, x8, [x20]
+; SOFTFP-NOLSE-NEXT:    cbnz w10, .LBB9_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB9_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr x23, x22, #32
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB9_2
+; SOFTFP-NOLSE-NEXT:  .LBB9_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
+; SOFTFP-NOLSE-NEXT:    mov w1, w23
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
index bfe0d20ca814bc..888b795876f7df 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
@@ -45,46 +45,49 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 {
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2:
 ; SOFTFP-NOLSE:       // %bb.0:
-; SOFTFP-NOLSE-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB0_2
-; SOFTFP-NOLSE-NEXT:  .LBB0_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB0_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB0_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB0_6
 ; SOFTFP-NOLSE-NEXT:  .LBB0_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB0_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
-; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    and w23, w20, #0xffff
 ; SOFTFP-NOLSE-NEXT:    mov w22, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    and w0, w20, #0xffff
+; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl fmaxf
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:  .LBB0_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB0_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB0_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w22, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB0_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB0_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB0_3
-; SOFTFP-NOLSE-NEXT:    b .LBB0_1
-; SOFTFP-NOLSE-NEXT:  .LBB0_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB0_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB0_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB0_2
+; SOFTFP-NOLSE-NEXT:  .LBB0_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; SOFTFP-NOLSE-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2
   ret half %res
@@ -130,46 +133,49 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4:
 ; SOFTFP-NOLSE:       // %bb.0:
-; SOFTFP-NOLSE-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB1_2
-; SOFTFP-NOLSE-NEXT:  .LBB1_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB1_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB1_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB1_6
 ; SOFTFP-NOLSE-NEXT:  .LBB1_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB1_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
-; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    and w23, w20, #0xffff
 ; SOFTFP-NOLSE-NEXT:    mov w22, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    and w0, w20, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
+; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl fmaxf
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:  .LBB1_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB1_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB1_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w22, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB1_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB1_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB1_3
-; SOFTFP-NOLSE-NEXT:    b .LBB1_1
-; SOFTFP-NOLSE-NEXT:  .LBB1_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB1_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB1_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB1_2
+; SOFTFP-NOLSE-NEXT:  .LBB1_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; SOFTFP-NOLSE-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4
   ret half %res
@@ -234,36 +240,40 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
-; SOFTFP-NOLSE-NEXT:    lsl w21, w1, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    lsl w20, w1, #16
 ; SOFTFP-NOLSE-NEXT:    b .LBB2_2
-; SOFTFP-NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB2_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB2_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB2_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB2_6
 ; SOFTFP-NOLSE-NEXT:  .LBB2_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB2_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    lsl w0, w20, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    lsl w0, w0, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
 ; SOFTFP-NOLSE-NEXT:    bl fmaxf
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:  .LBB2_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB2_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB2_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB2_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB2_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB2_3
-; SOFTFP-NOLSE-NEXT:    b .LBB2_1
-; SOFTFP-NOLSE-NEXT:  .LBB2_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB2_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB2_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB2_2
+; SOFTFP-NOLSE-NEXT:  .LBB2_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2
@@ -329,36 +339,40 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
-; SOFTFP-NOLSE-NEXT:    lsl w21, w1, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    lsl w20, w1, #16
 ; SOFTFP-NOLSE-NEXT:    b .LBB3_2
-; SOFTFP-NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB3_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB3_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB3_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB3_6
 ; SOFTFP-NOLSE-NEXT:  .LBB3_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB3_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    lsl w0, w20, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    lsl w0, w0, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
 ; SOFTFP-NOLSE-NEXT:    bl fmaxf
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:  .LBB3_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB3_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB3_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB3_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB3_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB3_3
-; SOFTFP-NOLSE-NEXT:    b .LBB3_1
-; SOFTFP-NOLSE-NEXT:  .LBB3_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB3_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB3_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB3_2
+; SOFTFP-NOLSE-NEXT:  .LBB3_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4
@@ -401,35 +415,38 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldr w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
+; SOFTFP-NOLSE-NEXT:    ldr w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB4_2
-; SOFTFP-NOLSE-NEXT:  .LBB4_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB4_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB4_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB4_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB4_6
 ; SOFTFP-NOLSE-NEXT:  .LBB4_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB4_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
 ; SOFTFP-NOLSE-NEXT:    bl fmaxf
-; SOFTFP-NOLSE-NEXT:  .LBB4_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB4_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB4_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20
+; SOFTFP-NOLSE-NEXT:    ldaxr w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB4_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB4_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB4_3
-; SOFTFP-NOLSE-NEXT:    b .LBB4_1
-; SOFTFP-NOLSE-NEXT:  .LBB4_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxr w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB4_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB4_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB4_2
+; SOFTFP-NOLSE-NEXT:  .LBB4_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4
@@ -471,36 +488,40 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) #
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8:
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    ldr x21, [x0]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldr x20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
-; SOFTFP-NOLSE-NEXT:    mov x21, x1
+; SOFTFP-NOLSE-NEXT:    mov x20, x1
 ; SOFTFP-NOLSE-NEXT:    b .LBB5_2
-; SOFTFP-NOLSE-NEXT:  .LBB5_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB5_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB5_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp x8, x20
-; SOFTFP-NOLSE-NEXT:    mov x20, x8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB5_5
+; SOFTFP-NOLSE-NEXT:    mov w9, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    mov x21, x8
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB5_6
 ; SOFTFP-NOLSE-NEXT:  .LBB5_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB5_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov x0, x20
-; SOFTFP-NOLSE-NEXT:    mov x1, x21
+; SOFTFP-NOLSE-NEXT:    mov x0, x21
+; SOFTFP-NOLSE-NEXT:    mov x1, x20
 ; SOFTFP-NOLSE-NEXT:    bl fmax
-; SOFTFP-NOLSE-NEXT:  .LBB5_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB5_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB5_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
 ; SOFTFP-NOLSE-NEXT:    ldaxr x8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp x8, x20
+; SOFTFP-NOLSE-NEXT:    cmp x8, x21
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB5_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB5_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, x0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB5_3
-; SOFTFP-NOLSE-NEXT:    b .LBB5_1
-; SOFTFP-NOLSE-NEXT:  .LBB5_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov x0, x20
+; SOFTFP-NOLSE-NEXT:    stlxr w9, x0, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB5_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB5_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w9, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    mov x21, x8
+; SOFTFP-NOLSE-NEXT:    cbz w9, .LBB5_2
+; SOFTFP-NOLSE-NEXT:  .LBB5_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    mov x0, x21
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8
@@ -567,18 +588,18 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half>
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    ldrh w23, [x0, #2]
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w21, [x0]
-; SOFTFP-NOLSE-NEXT:    mov w22, w1
+; SOFTFP-NOLSE-NEXT:    ldrh w22, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w19, w2
 ; SOFTFP-NOLSE-NEXT:    mov x20, x0
 ; SOFTFP-NOLSE-NEXT:    b .LBB6_2
-; SOFTFP-NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB6_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB6_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr w23, w8, #16
-; SOFTFP-NOLSE-NEXT:    cmp w8, w21
-; SOFTFP-NOLSE-NEXT:    mov w21, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB6_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr w23, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB6_6
 ; SOFTFP-NOLSE-NEXT:  .LBB6_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB6_3 Depth 2
@@ -591,29 +612,33 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half>
 ; SOFTFP-NOLSE-NEXT:    bl fmaxf
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
+; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w25, w0
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w1, w25
 ; SOFTFP-NOLSE-NEXT:    bl fmaxf
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:    bfi w21, w23, #16, #16
+; SOFTFP-NOLSE-NEXT:    mov w8, w22
 ; SOFTFP-NOLSE-NEXT:    bfi w0, w24, #16, #16
-; SOFTFP-NOLSE-NEXT:  .LBB6_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    bfi w8, w23, #16, #16
+; SOFTFP-NOLSE-NEXT:  .LBB6_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB6_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w8, [x20]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w21
+; SOFTFP-NOLSE-NEXT:    ldaxr w22, [x20]
+; SOFTFP-NOLSE-NEXT:    cmp w22, w8
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB6_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x20]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB6_3
-; SOFTFP-NOLSE-NEXT:    b .LBB6_1
-; SOFTFP-NOLSE-NEXT:  .LBB6_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w21
+; SOFTFP-NOLSE-NEXT:    stlxr w9, w0, [x20]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB6_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB6_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr w23, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB6_2
+; SOFTFP-NOLSE-NEXT:  .LBB6_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    mov w1, w23
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
@@ -723,17 +748,18 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    ldrh w1, [x0, #2]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w21, [x0]
+; SOFTFP-NOLSE-NEXT:    ldrh w22, [x0]
 ; SOFTFP-NOLSE-NEXT:    lsl w20, w2, #16
-; SOFTFP-NOLSE-NEXT:    lsl w22, w8, #16
+; SOFTFP-NOLSE-NEXT:    lsl w21, w8, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    b .LBB7_2
-; SOFTFP-NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB7_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB7_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr w1, w21, #16
-; SOFTFP-NOLSE-NEXT:    cmp w21, w23
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB7_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr w1, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB7_6
 ; SOFTFP-NOLSE-NEXT:  .LBB7_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB7_3 Depth 2
@@ -743,25 +769,28 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; SOFTFP-NOLSE-NEXT:    bl fmaxf
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    lsl w0, w21, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    lsl w0, w22, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl fmaxf
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:    bfxil w23, w21, #0, #16
+; SOFTFP-NOLSE-NEXT:    bfxil w23, w22, #0, #16
 ; SOFTFP-NOLSE-NEXT:    bfi w0, w24, #16, #16
-; SOFTFP-NOLSE-NEXT:  .LBB7_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB7_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB7_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w21, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w21, w23
+; SOFTFP-NOLSE-NEXT:    ldaxr w22, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w22, w23
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB7_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB7_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB7_3
-; SOFTFP-NOLSE-NEXT:    b .LBB7_1
-; SOFTFP-NOLSE-NEXT:  .LBB7_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w21
+; SOFTFP-NOLSE-NEXT:    stlxr w8, w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB7_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB7_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr w1, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB7_2
+; SOFTFP-NOLSE-NEXT:  .LBB7_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
@@ -809,45 +838,49 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w21, w1
-; SOFTFP-NOLSE-NEXT:    ldp w23, w22, [x0]
+; SOFTFP-NOLSE-NEXT:    ldp w22, w23, [x0]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w19, w2
 ; SOFTFP-NOLSE-NEXT:    mov x20, x0
 ; SOFTFP-NOLSE-NEXT:    b .LBB8_2
-; SOFTFP-NOLSE-NEXT:  .LBB8_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB8_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB8_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr x22, x23, #32
-; SOFTFP-NOLSE-NEXT:    cmp x23, x8
-; SOFTFP-NOLSE-NEXT:    // kill: def $w22 killed $w22 killed $x22 def $x22
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB8_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr x23, x22, #32
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB8_6
 ; SOFTFP-NOLSE-NEXT:  .LBB8_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB8_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov w0, w22
+; SOFTFP-NOLSE-NEXT:    mov w0, w23
 ; SOFTFP-NOLSE-NEXT:    mov w1, w19
 ; SOFTFP-NOLSE-NEXT:    bl fmaxf
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl fmaxf
-; SOFTFP-NOLSE-NEXT:    mov w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w9, w0
-; SOFTFP-NOLSE-NEXT:    orr x9, x9, x24, lsl #32
-; SOFTFP-NOLSE-NEXT:    orr x8, x8, x22, lsl #32
-; SOFTFP-NOLSE-NEXT:  .LBB8_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:    mov w9, w22
+; SOFTFP-NOLSE-NEXT:    // kill: def $w23 killed $w23 killed $x23 def $x23
+; SOFTFP-NOLSE-NEXT:    orr x8, x8, x24, lsl #32
+; SOFTFP-NOLSE-NEXT:    orr x9, x9, x23, lsl #32
+; SOFTFP-NOLSE-NEXT:  .LBB8_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB8_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr x23, [x20]
-; SOFTFP-NOLSE-NEXT:    cmp x23, x8
+; SOFTFP-NOLSE-NEXT:    ldaxr x22, [x20]
+; SOFTFP-NOLSE-NEXT:    cmp x22, x9
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB8_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB8_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, x9, [x20]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB8_3
-; SOFTFP-NOLSE-NEXT:    b .LBB8_1
-; SOFTFP-NOLSE-NEXT:  .LBB8_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    stlxr w10, x8, [x20]
+; SOFTFP-NOLSE-NEXT:    cbnz w10, .LBB8_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB8_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr x23, x22, #32
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB8_2
+; SOFTFP-NOLSE-NEXT:  .LBB8_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
+; SOFTFP-NOLSE-NEXT:    mov w1, w23
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
index 6b7d2df044460a..a3665c6e428608 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
@@ -45,46 +45,49 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 {
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2:
 ; SOFTFP-NOLSE:       // %bb.0:
-; SOFTFP-NOLSE-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB0_2
-; SOFTFP-NOLSE-NEXT:  .LBB0_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB0_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB0_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB0_6
 ; SOFTFP-NOLSE-NEXT:  .LBB0_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB0_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
-; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    and w23, w20, #0xffff
 ; SOFTFP-NOLSE-NEXT:    mov w22, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    and w0, w20, #0xffff
+; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl fminf
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:  .LBB0_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB0_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB0_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w22, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB0_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB0_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB0_3
-; SOFTFP-NOLSE-NEXT:    b .LBB0_1
-; SOFTFP-NOLSE-NEXT:  .LBB0_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB0_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB0_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB0_2
+; SOFTFP-NOLSE-NEXT:  .LBB0_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; SOFTFP-NOLSE-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2
   ret half %res
@@ -130,46 +133,49 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4:
 ; SOFTFP-NOLSE:       // %bb.0:
-; SOFTFP-NOLSE-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB1_2
-; SOFTFP-NOLSE-NEXT:  .LBB1_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB1_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB1_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB1_6
 ; SOFTFP-NOLSE-NEXT:  .LBB1_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB1_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
-; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    and w23, w20, #0xffff
 ; SOFTFP-NOLSE-NEXT:    mov w22, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    and w0, w20, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
+; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl fminf
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:  .LBB1_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB1_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB1_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w22, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB1_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB1_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB1_3
-; SOFTFP-NOLSE-NEXT:    b .LBB1_1
-; SOFTFP-NOLSE-NEXT:  .LBB1_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB1_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB1_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB1_2
+; SOFTFP-NOLSE-NEXT:  .LBB1_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; SOFTFP-NOLSE-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 4
   ret half %res
@@ -234,36 +240,40 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
-; SOFTFP-NOLSE-NEXT:    lsl w21, w1, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    lsl w20, w1, #16
 ; SOFTFP-NOLSE-NEXT:    b .LBB2_2
-; SOFTFP-NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB2_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB2_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB2_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB2_6
 ; SOFTFP-NOLSE-NEXT:  .LBB2_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB2_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    lsl w0, w20, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    lsl w0, w0, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
 ; SOFTFP-NOLSE-NEXT:    bl fminf
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:  .LBB2_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB2_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB2_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB2_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB2_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB2_3
-; SOFTFP-NOLSE-NEXT:    b .LBB2_1
-; SOFTFP-NOLSE-NEXT:  .LBB2_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB2_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB2_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB2_2
+; SOFTFP-NOLSE-NEXT:  .LBB2_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2
@@ -329,36 +339,40 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
-; SOFTFP-NOLSE-NEXT:    lsl w21, w1, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    lsl w20, w1, #16
 ; SOFTFP-NOLSE-NEXT:    b .LBB3_2
-; SOFTFP-NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB3_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB3_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB3_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB3_6
 ; SOFTFP-NOLSE-NEXT:  .LBB3_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB3_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    lsl w0, w20, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    lsl w0, w0, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
 ; SOFTFP-NOLSE-NEXT:    bl fminf
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:  .LBB3_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB3_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB3_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB3_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB3_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB3_3
-; SOFTFP-NOLSE-NEXT:    b .LBB3_1
-; SOFTFP-NOLSE-NEXT:  .LBB3_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB3_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB3_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB3_2
+; SOFTFP-NOLSE-NEXT:  .LBB3_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4
@@ -401,35 +415,38 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldr w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
+; SOFTFP-NOLSE-NEXT:    ldr w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB4_2
-; SOFTFP-NOLSE-NEXT:  .LBB4_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB4_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB4_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB4_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB4_6
 ; SOFTFP-NOLSE-NEXT:  .LBB4_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB4_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
 ; SOFTFP-NOLSE-NEXT:    bl fminf
-; SOFTFP-NOLSE-NEXT:  .LBB4_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB4_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB4_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20
+; SOFTFP-NOLSE-NEXT:    ldaxr w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB4_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB4_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB4_3
-; SOFTFP-NOLSE-NEXT:    b .LBB4_1
-; SOFTFP-NOLSE-NEXT:  .LBB4_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxr w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB4_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB4_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB4_2
+; SOFTFP-NOLSE-NEXT:  .LBB4_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4
@@ -471,36 +488,40 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) #
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8:
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    ldr x21, [x0]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldr x20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
-; SOFTFP-NOLSE-NEXT:    mov x21, x1
+; SOFTFP-NOLSE-NEXT:    mov x20, x1
 ; SOFTFP-NOLSE-NEXT:    b .LBB5_2
-; SOFTFP-NOLSE-NEXT:  .LBB5_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB5_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB5_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp x8, x20
-; SOFTFP-NOLSE-NEXT:    mov x20, x8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB5_5
+; SOFTFP-NOLSE-NEXT:    mov w9, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    mov x21, x8
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB5_6
 ; SOFTFP-NOLSE-NEXT:  .LBB5_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB5_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov x0, x20
-; SOFTFP-NOLSE-NEXT:    mov x1, x21
+; SOFTFP-NOLSE-NEXT:    mov x0, x21
+; SOFTFP-NOLSE-NEXT:    mov x1, x20
 ; SOFTFP-NOLSE-NEXT:    bl fmin
-; SOFTFP-NOLSE-NEXT:  .LBB5_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB5_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB5_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
 ; SOFTFP-NOLSE-NEXT:    ldaxr x8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp x8, x20
+; SOFTFP-NOLSE-NEXT:    cmp x8, x21
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB5_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB5_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, x0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB5_3
-; SOFTFP-NOLSE-NEXT:    b .LBB5_1
-; SOFTFP-NOLSE-NEXT:  .LBB5_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov x0, x20
+; SOFTFP-NOLSE-NEXT:    stlxr w9, x0, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB5_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB5_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w9, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    mov x21, x8
+; SOFTFP-NOLSE-NEXT:    cbz w9, .LBB5_2
+; SOFTFP-NOLSE-NEXT:  .LBB5_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    mov x0, x21
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8
@@ -567,18 +588,18 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half>
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    ldrh w23, [x0, #2]
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w21, [x0]
-; SOFTFP-NOLSE-NEXT:    mov w22, w1
+; SOFTFP-NOLSE-NEXT:    ldrh w22, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w19, w2
 ; SOFTFP-NOLSE-NEXT:    mov x20, x0
 ; SOFTFP-NOLSE-NEXT:    b .LBB6_2
-; SOFTFP-NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB6_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB6_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr w23, w8, #16
-; SOFTFP-NOLSE-NEXT:    cmp w8, w21
-; SOFTFP-NOLSE-NEXT:    mov w21, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB6_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr w23, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB6_6
 ; SOFTFP-NOLSE-NEXT:  .LBB6_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB6_3 Depth 2
@@ -591,29 +612,33 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half>
 ; SOFTFP-NOLSE-NEXT:    bl fminf
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
+; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w25, w0
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w1, w25
 ; SOFTFP-NOLSE-NEXT:    bl fminf
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:    bfi w21, w23, #16, #16
+; SOFTFP-NOLSE-NEXT:    mov w8, w22
 ; SOFTFP-NOLSE-NEXT:    bfi w0, w24, #16, #16
-; SOFTFP-NOLSE-NEXT:  .LBB6_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    bfi w8, w23, #16, #16
+; SOFTFP-NOLSE-NEXT:  .LBB6_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB6_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w8, [x20]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w21
+; SOFTFP-NOLSE-NEXT:    ldaxr w22, [x20]
+; SOFTFP-NOLSE-NEXT:    cmp w22, w8
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB6_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x20]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB6_3
-; SOFTFP-NOLSE-NEXT:    b .LBB6_1
-; SOFTFP-NOLSE-NEXT:  .LBB6_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w21
+; SOFTFP-NOLSE-NEXT:    stlxr w9, w0, [x20]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB6_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB6_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr w23, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB6_2
+; SOFTFP-NOLSE-NEXT:  .LBB6_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    mov w1, w23
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
@@ -723,17 +748,18 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    ldrh w1, [x0, #2]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w21, [x0]
+; SOFTFP-NOLSE-NEXT:    ldrh w22, [x0]
 ; SOFTFP-NOLSE-NEXT:    lsl w20, w2, #16
-; SOFTFP-NOLSE-NEXT:    lsl w22, w8, #16
+; SOFTFP-NOLSE-NEXT:    lsl w21, w8, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    b .LBB7_2
-; SOFTFP-NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB7_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB7_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr w1, w21, #16
-; SOFTFP-NOLSE-NEXT:    cmp w21, w23
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB7_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr w1, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB7_6
 ; SOFTFP-NOLSE-NEXT:  .LBB7_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB7_3 Depth 2
@@ -743,25 +769,28 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; SOFTFP-NOLSE-NEXT:    bl fminf
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    lsl w0, w21, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    lsl w0, w22, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl fminf
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:    bfxil w23, w21, #0, #16
+; SOFTFP-NOLSE-NEXT:    bfxil w23, w22, #0, #16
 ; SOFTFP-NOLSE-NEXT:    bfi w0, w24, #16, #16
-; SOFTFP-NOLSE-NEXT:  .LBB7_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB7_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB7_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w21, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w21, w23
+; SOFTFP-NOLSE-NEXT:    ldaxr w22, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w22, w23
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB7_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB7_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB7_3
-; SOFTFP-NOLSE-NEXT:    b .LBB7_1
-; SOFTFP-NOLSE-NEXT:  .LBB7_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w21
+; SOFTFP-NOLSE-NEXT:    stlxr w8, w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB7_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB7_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr w1, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB7_2
+; SOFTFP-NOLSE-NEXT:  .LBB7_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
@@ -809,45 +838,49 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w21, w1
-; SOFTFP-NOLSE-NEXT:    ldp w23, w22, [x0]
+; SOFTFP-NOLSE-NEXT:    ldp w22, w23, [x0]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w19, w2
 ; SOFTFP-NOLSE-NEXT:    mov x20, x0
 ; SOFTFP-NOLSE-NEXT:    b .LBB8_2
-; SOFTFP-NOLSE-NEXT:  .LBB8_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB8_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB8_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr x22, x23, #32
-; SOFTFP-NOLSE-NEXT:    cmp x23, x8
-; SOFTFP-NOLSE-NEXT:    // kill: def $w22 killed $w22 killed $x22 def $x22
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB8_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr x23, x22, #32
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB8_6
 ; SOFTFP-NOLSE-NEXT:  .LBB8_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB8_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov w0, w22
+; SOFTFP-NOLSE-NEXT:    mov w0, w23
 ; SOFTFP-NOLSE-NEXT:    mov w1, w19
 ; SOFTFP-NOLSE-NEXT:    bl fminf
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl fminf
-; SOFTFP-NOLSE-NEXT:    mov w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w9, w0
-; SOFTFP-NOLSE-NEXT:    orr x9, x9, x24, lsl #32
-; SOFTFP-NOLSE-NEXT:    orr x8, x8, x22, lsl #32
-; SOFTFP-NOLSE-NEXT:  .LBB8_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:    mov w9, w22
+; SOFTFP-NOLSE-NEXT:    // kill: def $w23 killed $w23 killed $x23 def $x23
+; SOFTFP-NOLSE-NEXT:    orr x8, x8, x24, lsl #32
+; SOFTFP-NOLSE-NEXT:    orr x9, x9, x23, lsl #32
+; SOFTFP-NOLSE-NEXT:  .LBB8_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB8_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr x23, [x20]
-; SOFTFP-NOLSE-NEXT:    cmp x23, x8
+; SOFTFP-NOLSE-NEXT:    ldaxr x22, [x20]
+; SOFTFP-NOLSE-NEXT:    cmp x22, x9
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB8_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB8_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, x9, [x20]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB8_3
-; SOFTFP-NOLSE-NEXT:    b .LBB8_1
-; SOFTFP-NOLSE-NEXT:  .LBB8_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    stlxr w10, x8, [x20]
+; SOFTFP-NOLSE-NEXT:    cbnz w10, .LBB8_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB8_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr x23, x22, #32
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB8_2
+; SOFTFP-NOLSE-NEXT:  .LBB8_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
+; SOFTFP-NOLSE-NEXT:    mov w1, w23
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
index 67e164037d5ce7..7725ce0e731859 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
@@ -43,46 +43,49 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 {
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2:
 ; SOFTFP-NOLSE:       // %bb.0:
-; SOFTFP-NOLSE-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB0_2
-; SOFTFP-NOLSE-NEXT:  .LBB0_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB0_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB0_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB0_6
 ; SOFTFP-NOLSE-NEXT:  .LBB0_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB0_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
-; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    and w23, w20, #0xffff
 ; SOFTFP-NOLSE-NEXT:    mov w22, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    and w0, w20, #0xffff
+; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl __subsf3
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:  .LBB0_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB0_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB0_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w22, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB0_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB0_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB0_3
-; SOFTFP-NOLSE-NEXT:    b .LBB0_1
-; SOFTFP-NOLSE-NEXT:  .LBB0_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB0_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB0_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB0_2
+; SOFTFP-NOLSE-NEXT:  .LBB0_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; SOFTFP-NOLSE-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2
   ret half %res
@@ -128,46 +131,49 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4:
 ; SOFTFP-NOLSE:       // %bb.0:
-; SOFTFP-NOLSE-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB1_2
-; SOFTFP-NOLSE-NEXT:  .LBB1_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB1_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB1_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB1_6
 ; SOFTFP-NOLSE-NEXT:  .LBB1_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB1_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
-; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    and w23, w20, #0xffff
 ; SOFTFP-NOLSE-NEXT:    mov w22, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    and w0, w20, #0xffff
+; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl __subsf3
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:  .LBB1_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB1_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB1_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w22, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB1_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB1_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB1_3
-; SOFTFP-NOLSE-NEXT:    b .LBB1_1
-; SOFTFP-NOLSE-NEXT:  .LBB1_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB1_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB1_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB1_2
+; SOFTFP-NOLSE-NEXT:  .LBB1_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; SOFTFP-NOLSE-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4
   ret half %res
@@ -232,36 +238,40 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
-; SOFTFP-NOLSE-NEXT:    lsl w21, w1, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    lsl w20, w1, #16
 ; SOFTFP-NOLSE-NEXT:    b .LBB2_2
-; SOFTFP-NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB2_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB2_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB2_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB2_6
 ; SOFTFP-NOLSE-NEXT:  .LBB2_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB2_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    lsl w0, w20, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    lsl w0, w0, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
 ; SOFTFP-NOLSE-NEXT:    bl __subsf3
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:  .LBB2_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB2_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB2_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB2_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB2_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB2_3
-; SOFTFP-NOLSE-NEXT:    b .LBB2_1
-; SOFTFP-NOLSE-NEXT:  .LBB2_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB2_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB2_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB2_2
+; SOFTFP-NOLSE-NEXT:  .LBB2_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2
@@ -327,36 +337,40 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w20, [x0]
-; SOFTFP-NOLSE-NEXT:    lsl w21, w1, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
+; SOFTFP-NOLSE-NEXT:    ldrh w0, [x0]
+; SOFTFP-NOLSE-NEXT:    lsl w20, w1, #16
 ; SOFTFP-NOLSE-NEXT:    b .LBB3_2
-; SOFTFP-NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB3_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB3_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB3_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB3_6
 ; SOFTFP-NOLSE-NEXT:  .LBB3_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB3_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    lsl w0, w20, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
+; SOFTFP-NOLSE-NEXT:    lsl w0, w0, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
 ; SOFTFP-NOLSE-NEXT:    bl __subsf3
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:  .LBB3_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB3_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB3_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxrh w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20, uxth
+; SOFTFP-NOLSE-NEXT:    ldaxrh w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21, uxth
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB3_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB3_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxrh wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB3_3
-; SOFTFP-NOLSE-NEXT:    b .LBB3_1
-; SOFTFP-NOLSE-NEXT:  .LBB3_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxrh w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB3_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB3_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB3_2
+; SOFTFP-NOLSE-NEXT:  .LBB3_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4
@@ -399,35 +413,38 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldr w20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
-; SOFTFP-NOLSE-NEXT:    mov w21, w1
+; SOFTFP-NOLSE-NEXT:    ldr w0, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w20, w1
 ; SOFTFP-NOLSE-NEXT:    b .LBB4_2
-; SOFTFP-NOLSE-NEXT:  .LBB4_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB4_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB4_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20
-; SOFTFP-NOLSE-NEXT:    mov w20, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB4_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB4_6
 ; SOFTFP-NOLSE-NEXT:  .LBB4_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB4_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
-; SOFTFP-NOLSE-NEXT:    mov w1, w21
+; SOFTFP-NOLSE-NEXT:    mov w1, w20
+; SOFTFP-NOLSE-NEXT:    mov w21, w0
 ; SOFTFP-NOLSE-NEXT:    bl __subsf3
-; SOFTFP-NOLSE-NEXT:  .LBB4_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:  .LBB4_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB4_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w20
+; SOFTFP-NOLSE-NEXT:    ldaxr w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w0, w21
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB4_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB4_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB4_3
-; SOFTFP-NOLSE-NEXT:    b .LBB4_1
-; SOFTFP-NOLSE-NEXT:  .LBB4_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w20
+; SOFTFP-NOLSE-NEXT:    stlxr w9, w8, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB4_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB4_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB4_2
+; SOFTFP-NOLSE-NEXT:  .LBB4_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4
@@ -469,36 +486,40 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) #
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8:
 ; SOFTFP-NOLSE:       // %bb.0:
 ; SOFTFP-NOLSE-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; SOFTFP-NOLSE-NEXT:    ldr x21, [x0]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldr x20, [x0]
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
-; SOFTFP-NOLSE-NEXT:    mov x21, x1
+; SOFTFP-NOLSE-NEXT:    mov x20, x1
 ; SOFTFP-NOLSE-NEXT:    b .LBB5_2
-; SOFTFP-NOLSE-NEXT:  .LBB5_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB5_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB5_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    cmp x8, x20
-; SOFTFP-NOLSE-NEXT:    mov x20, x8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB5_5
+; SOFTFP-NOLSE-NEXT:    mov w9, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    mov x21, x8
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB5_6
 ; SOFTFP-NOLSE-NEXT:  .LBB5_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB5_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov x0, x20
-; SOFTFP-NOLSE-NEXT:    mov x1, x21
+; SOFTFP-NOLSE-NEXT:    mov x0, x21
+; SOFTFP-NOLSE-NEXT:    mov x1, x20
 ; SOFTFP-NOLSE-NEXT:    bl __subdf3
-; SOFTFP-NOLSE-NEXT:  .LBB5_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB5_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB5_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
 ; SOFTFP-NOLSE-NEXT:    ldaxr x8, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp x8, x20
+; SOFTFP-NOLSE-NEXT:    cmp x8, x21
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB5_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB5_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, x0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB5_3
-; SOFTFP-NOLSE-NEXT:    b .LBB5_1
-; SOFTFP-NOLSE-NEXT:  .LBB5_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov x0, x20
+; SOFTFP-NOLSE-NEXT:    stlxr w9, x0, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB5_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB5_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w9, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    mov x21, x8
+; SOFTFP-NOLSE-NEXT:    cbz w9, .LBB5_2
+; SOFTFP-NOLSE-NEXT:  .LBB5_6: // %atomicrmw.end
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; SOFTFP-NOLSE-NEXT:    mov x0, x21
 ; SOFTFP-NOLSE-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ret
   %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8
@@ -687,18 +708,18 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half>
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    ldrh w23, [x0, #2]
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w21, [x0]
-; SOFTFP-NOLSE-NEXT:    mov w22, w1
+; SOFTFP-NOLSE-NEXT:    ldrh w22, [x0]
+; SOFTFP-NOLSE-NEXT:    mov w21, w1
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w19, w2
 ; SOFTFP-NOLSE-NEXT:    mov x20, x0
 ; SOFTFP-NOLSE-NEXT:    b .LBB7_2
-; SOFTFP-NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB7_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB7_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr w23, w8, #16
-; SOFTFP-NOLSE-NEXT:    cmp w8, w21
-; SOFTFP-NOLSE-NEXT:    mov w21, w8
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB7_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr w23, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB7_6
 ; SOFTFP-NOLSE-NEXT:  .LBB7_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB7_3 Depth 2
@@ -711,29 +732,33 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half>
 ; SOFTFP-NOLSE-NEXT:    bl __subsf3
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
+; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w25, w0
-; SOFTFP-NOLSE-NEXT:    and w0, w21, #0xffff
+; SOFTFP-NOLSE-NEXT:    and w0, w22, #0xffff
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_h2f_ieee
 ; SOFTFP-NOLSE-NEXT:    mov w1, w25
 ; SOFTFP-NOLSE-NEXT:    bl __subsf3
 ; SOFTFP-NOLSE-NEXT:    bl __gnu_f2h_ieee
-; SOFTFP-NOLSE-NEXT:    bfi w21, w23, #16, #16
+; SOFTFP-NOLSE-NEXT:    mov w8, w22
 ; SOFTFP-NOLSE-NEXT:    bfi w0, w24, #16, #16
-; SOFTFP-NOLSE-NEXT:  .LBB7_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    bfi w8, w23, #16, #16
+; SOFTFP-NOLSE-NEXT:  .LBB7_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB7_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w8, [x20]
-; SOFTFP-NOLSE-NEXT:    cmp w8, w21
+; SOFTFP-NOLSE-NEXT:    ldaxr w22, [x20]
+; SOFTFP-NOLSE-NEXT:    cmp w22, w8
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB7_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB7_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x20]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB7_3
-; SOFTFP-NOLSE-NEXT:    b .LBB7_1
-; SOFTFP-NOLSE-NEXT:  .LBB7_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w21
+; SOFTFP-NOLSE-NEXT:    stlxr w9, w0, [x20]
+; SOFTFP-NOLSE-NEXT:    cbnz w9, .LBB7_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB7_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr w23, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB7_2
+; SOFTFP-NOLSE-NEXT:  .LBB7_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    mov w1, w23
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
@@ -799,17 +824,18 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    ldrh w1, [x0, #2]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; SOFTFP-NOLSE-NEXT:    ldrh w21, [x0]
+; SOFTFP-NOLSE-NEXT:    ldrh w22, [x0]
 ; SOFTFP-NOLSE-NEXT:    lsl w20, w2, #16
-; SOFTFP-NOLSE-NEXT:    lsl w22, w8, #16
+; SOFTFP-NOLSE-NEXT:    lsl w21, w8, #16
 ; SOFTFP-NOLSE-NEXT:    mov x19, x0
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    b .LBB8_2
-; SOFTFP-NOLSE-NEXT:  .LBB8_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB8_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB8_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr w1, w21, #16
-; SOFTFP-NOLSE-NEXT:    cmp w21, w23
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB8_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr w1, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB8_6
 ; SOFTFP-NOLSE-NEXT:  .LBB8_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB8_3 Depth 2
@@ -819,25 +845,28 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; SOFTFP-NOLSE-NEXT:    bl __subsf3
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    lsl w0, w21, #16
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    lsl w0, w22, #16
+; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl __subsf3
 ; SOFTFP-NOLSE-NEXT:    bl __truncsfbf2
-; SOFTFP-NOLSE-NEXT:    bfxil w23, w21, #0, #16
+; SOFTFP-NOLSE-NEXT:    bfxil w23, w22, #0, #16
 ; SOFTFP-NOLSE-NEXT:    bfi w0, w24, #16, #16
-; SOFTFP-NOLSE-NEXT:  .LBB8_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB8_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB8_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr w21, [x19]
-; SOFTFP-NOLSE-NEXT:    cmp w21, w23
+; SOFTFP-NOLSE-NEXT:    ldaxr w22, [x19]
+; SOFTFP-NOLSE-NEXT:    cmp w22, w23
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB8_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB8_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, w0, [x19]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB8_3
-; SOFTFP-NOLSE-NEXT:    b .LBB8_1
-; SOFTFP-NOLSE-NEXT:  .LBB8_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w21
+; SOFTFP-NOLSE-NEXT:    stlxr w8, w0, [x19]
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB8_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB8_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr w1, w22, #16
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB8_2
+; SOFTFP-NOLSE-NEXT:  .LBB8_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
@@ -885,45 +914,49 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa
 ; SOFTFP-NOLSE-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w21, w1
-; SOFTFP-NOLSE-NEXT:    ldp w23, w22, [x0]
+; SOFTFP-NOLSE-NEXT:    ldp w22, w23, [x0]
 ; SOFTFP-NOLSE-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
 ; SOFTFP-NOLSE-NEXT:    mov w19, w2
 ; SOFTFP-NOLSE-NEXT:    mov x20, x0
 ; SOFTFP-NOLSE-NEXT:    b .LBB9_2
-; SOFTFP-NOLSE-NEXT:  .LBB9_1: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  .LBB9_1: // %cmpxchg.nostore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB9_2 Depth=1
-; SOFTFP-NOLSE-NEXT:    lsr x22, x23, #32
-; SOFTFP-NOLSE-NEXT:    cmp x23, x8
-; SOFTFP-NOLSE-NEXT:    // kill: def $w22 killed $w22 killed $x22 def $x22
-; SOFTFP-NOLSE-NEXT:    b.eq .LBB9_5
+; SOFTFP-NOLSE-NEXT:    mov w8, wzr
+; SOFTFP-NOLSE-NEXT:    clrex
+; SOFTFP-NOLSE-NEXT:    lsr x23, x22, #32
+; SOFTFP-NOLSE-NEXT:    cbnz w8, .LBB9_6
 ; SOFTFP-NOLSE-NEXT:  .LBB9_2: // %atomicrmw.start
 ; SOFTFP-NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; SOFTFP-NOLSE-NEXT:    // Child Loop BB9_3 Depth 2
-; SOFTFP-NOLSE-NEXT:    mov w0, w22
+; SOFTFP-NOLSE-NEXT:    mov w0, w23
 ; SOFTFP-NOLSE-NEXT:    mov w1, w19
 ; SOFTFP-NOLSE-NEXT:    bl __subsf3
 ; SOFTFP-NOLSE-NEXT:    mov w24, w0
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
 ; SOFTFP-NOLSE-NEXT:    mov w1, w21
 ; SOFTFP-NOLSE-NEXT:    bl __subsf3
-; SOFTFP-NOLSE-NEXT:    mov w8, w23
-; SOFTFP-NOLSE-NEXT:    mov w9, w0
-; SOFTFP-NOLSE-NEXT:    orr x9, x9, x24, lsl #32
-; SOFTFP-NOLSE-NEXT:    orr x8, x8, x22, lsl #32
-; SOFTFP-NOLSE-NEXT:  .LBB9_3: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:    mov w8, w0
+; SOFTFP-NOLSE-NEXT:    mov w9, w22
+; SOFTFP-NOLSE-NEXT:    // kill: def $w23 killed $w23 killed $x23 def $x23
+; SOFTFP-NOLSE-NEXT:    orr x8, x8, x24, lsl #32
+; SOFTFP-NOLSE-NEXT:    orr x9, x9, x23, lsl #32
+; SOFTFP-NOLSE-NEXT:  .LBB9_3: // %cmpxchg.start
 ; SOFTFP-NOLSE-NEXT:    // Parent Loop BB9_2 Depth=1
 ; SOFTFP-NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; SOFTFP-NOLSE-NEXT:    ldaxr x23, [x20]
-; SOFTFP-NOLSE-NEXT:    cmp x23, x8
+; SOFTFP-NOLSE-NEXT:    ldaxr x22, [x20]
+; SOFTFP-NOLSE-NEXT:    cmp x22, x9
 ; SOFTFP-NOLSE-NEXT:    b.ne .LBB9_1
-; SOFTFP-NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; SOFTFP-NOLSE-NEXT:  // %bb.4: // %cmpxchg.trystore
 ; SOFTFP-NOLSE-NEXT:    // in Loop: Header=BB9_3 Depth=2
-; SOFTFP-NOLSE-NEXT:    stlxr wzr, x9, [x20]
-; SOFTFP-NOLSE-NEXT:    cbnz wzr, .LBB9_3
-; SOFTFP-NOLSE-NEXT:    b .LBB9_1
-; SOFTFP-NOLSE-NEXT:  .LBB9_5: // %atomicrmw.end
-; SOFTFP-NOLSE-NEXT:    mov w0, w23
-; SOFTFP-NOLSE-NEXT:    mov w1, w22
+; SOFTFP-NOLSE-NEXT:    stlxr w10, x8, [x20]
+; SOFTFP-NOLSE-NEXT:    cbnz w10, .LBB9_3
+; SOFTFP-NOLSE-NEXT:  // %bb.5: // in Loop: Header=BB9_2 Depth=1
+; SOFTFP-NOLSE-NEXT:    mov w8, #1 // =0x1
+; SOFTFP-NOLSE-NEXT:    lsr x23, x22, #32
+; SOFTFP-NOLSE-NEXT:    cbz w8, .LBB9_2
+; SOFTFP-NOLSE-NEXT:  .LBB9_6: // %atomicrmw.end
+; SOFTFP-NOLSE-NEXT:    mov w0, w22
+; SOFTFP-NOLSE-NEXT:    mov w1, w23
 ; SOFTFP-NOLSE-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
 ; SOFTFP-NOLSE-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload

>From 4370dd4bb5d3527e8ae5348b3c71d335d2feeb4f Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Fri, 30 Aug 2024 13:53:29 -0700
Subject: [PATCH 2/3] Fix use after free

---
 llvm/lib/CodeGen/AtomicExpandPass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index b9732e816ea7e6..d9843554860d63 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -368,7 +368,7 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {
         // Detect control flow change and resume iteration from the original
         // block to inspect any newly inserted blocks. This allows incremental
         // legalizaton of atomicrmw and cmpxchg.
-        if (BB != Next->getParent()) {
+        if (Next == E || BB != Next->getParent()) {
           BBI = BB->getIterator();
           BBE = F.end();
           break;

>From db21593e493caf86e8fce7d68d1c6b199a3795c1 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Fri, 30 Aug 2024 13:53:56 -0700
Subject: [PATCH 3/3] fix typo

---
 llvm/lib/CodeGen/AtomicExpandPass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index d9843554860d63..2da723a0cc175b 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -367,7 +367,7 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {
 
         // Detect control flow change and resume iteration from the original
         // block to inspect any newly inserted blocks. This allows incremental
-        // legalizaton of atomicrmw and cmpxchg.
+        // legalization of atomicrmw and cmpxchg.
         if (Next == E || BB != Next->getParent()) {
           BBI = BB->getIterator();
           BBE = F.end();



More information about the llvm-commits mailing list