[llvm] 3a00e58 - AArch64: use indivisible cmpxchg for 128-bit atomic loads at O0

Tim Northover via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 22 06:20:46 PDT 2021


Author: Tim Northover
Date: 2021-09-22T14:20:43+01:00
New Revision: 3a00e58c2fca0c20d3792c897ef1ea54b6a168a0

URL: https://github.com/llvm/llvm-project/commit/3a00e58c2fca0c20d3792c897ef1ea54b6a168a0
DIFF: https://github.com/llvm/llvm-project/commit/3a00e58c2fca0c20d3792c897ef1ea54b6a168a0.diff

LOG: AArch64: use indivisible cmpxchg for 128-bit atomic loads at O0

As with normal atomicrmw operations, at -O0 the simple register allocator can
insert spills into the LL/SC loop if it has been expanded and is visible when
regalloc runs. The spill stores can repeatedly clear the exclusive monitor,
so the operation never succeeds. Instead, expand to a cmpxchg, which has a
dedicated pseudo-instruction for -O0.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 186a954e75039..9aee359c57205 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17670,6 +17670,14 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   if (Size != 128 || isOpSuitableForLDPSTP(LI))
     return AtomicExpansionKind::None;
 
+  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+  // implement atomicrmw without spilling. If the target address is also on the
+  // stack and close enough to the spill slot, this can lead to a situation
+  // where the monitor always gets cleared and the atomic operation can never
+  // succeed. So at -O0 lower this operation to a CAS loop.
+  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+    return AtomicExpansionKind::CmpXChg;
+
   return AtomicExpansionKind::LLSC;
 }
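
(Context for the new check: the LLSC expansion that this hook otherwise
requests builds an IR-level loop from the target's load-exclusive and
store-exclusive intrinsics. A rough sketch of that loop for a 128-bit load,
with an illustrative %addr and illustrative labels:

    loop:
      ; ldxp sets the exclusive monitor for the 16 bytes at %addr
      %lohi = call { i64, i64 } @llvm.aarch64.ldxp(i8* %addr)
      %lo = extractvalue { i64, i64 } %lohi, 0
      %hi = extractvalue { i64, i64 } %lohi, 1
      ; at -O0 fast-regalloc may insert spill code right here; a spill
      ; store close to %addr clears the monitor, so the stxp below can
      ; fail on every iteration
      %status = call i32 @llvm.aarch64.stxp(i64 %lo, i64 %hi, i8* %addr)
      %again = icmp ne i32 %status, 0
      br i1 %again, label %loop, label %done

The cmpxchg form avoids this because it is selected to a pseudo-instruction
that is only expanded after register allocation, so no spills can land inside
the LL/SC sequence.)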
 

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
index ecf197009407d..5b81be232ec03 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
@@ -373,60 +373,42 @@ define void @atomic_load_relaxed(i64, i64, i128* %p, i128* %p2) {
 ;
 ; CHECK-LLSC-O0-LABEL: atomic_load_relaxed:
 ; CHECK-LLSC-O0:       // %bb.0:
-; CHECK-LLSC-O0-NEXT:    sub sp, sp, #48
-; CHECK-LLSC-O0-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-LLSC-O0-NEXT:    str x2, [sp, #32] // 8-byte Folded Spill
-; CHECK-LLSC-O0-NEXT:    str x3, [sp, #40] // 8-byte Folded Spill
-; CHECK-LLSC-O0-NEXT:    b .LBB4_1
-; CHECK-LLSC-O0-NEXT:  .LBB4_1: // %atomicrmw.start
-; CHECK-LLSC-O0-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-LLSC-O0-NEXT:    ldr x11, [sp, #32] // 8-byte Folded Reload
-; CHECK-LLSC-O0-NEXT:    ldxp x9, x10, [x11]
-; CHECK-LLSC-O0-NEXT:    mov x8, xzr
-; CHECK-LLSC-O0-NEXT:    orr x9, x9, x8
-; CHECK-LLSC-O0-NEXT:    orr x10, x8, x10
+; CHECK-LLSC-O0-NEXT:    mov x11, xzr
+; CHECK-LLSC-O0-NEXT:  .LBB4_1: // =>This Inner Loop Header: Depth=1
+; CHECK-LLSC-O0-NEXT:    ldxp x9, x8, [x2]
+; CHECK-LLSC-O0-NEXT:    cmp x9, x11
+; CHECK-LLSC-O0-NEXT:    cset w10, ne
+; CHECK-LLSC-O0-NEXT:    cmp x8, x11
+; CHECK-LLSC-O0-NEXT:    cinc w10, w10, ne
+; CHECK-LLSC-O0-NEXT:    cbnz w10, .LBB4_3
+; CHECK-LLSC-O0-NEXT:  // %bb.2: // in Loop: Header=BB4_1 Depth=1
+; CHECK-LLSC-O0-NEXT:    stxp w10, x11, x11, [x2]
+; CHECK-LLSC-O0-NEXT:    cbnz w10, .LBB4_1
+; CHECK-LLSC-O0-NEXT:    b .LBB4_4
+; CHECK-LLSC-O0-NEXT:  .LBB4_3: // in Loop: Header=BB4_1 Depth=1
+; CHECK-LLSC-O0-NEXT:    stxp w10, x9, x8, [x2]
+; CHECK-LLSC-O0-NEXT:    cbnz w10, .LBB4_1
+; CHECK-LLSC-O0-NEXT:  .LBB4_4:
 ; CHECK-LLSC-O0-NEXT:    // implicit-def: $q0
 ; CHECK-LLSC-O0-NEXT:    mov v0.d[0], x9
-; CHECK-LLSC-O0-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-LLSC-O0-NEXT:    mov v0.d[1], x10
-; CHECK-LLSC-O0-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-LLSC-O0-NEXT:    stxp w8, x9, x10, [x11]
-; CHECK-LLSC-O0-NEXT:    cbnz w8, .LBB4_1
-; CHECK-LLSC-O0-NEXT:    b .LBB4_2
-; CHECK-LLSC-O0-NEXT:  .LBB4_2: // %atomicrmw.end
-; CHECK-LLSC-O0-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-LLSC-O0-NEXT:    ldr x8, [sp, #40] // 8-byte Folded Reload
-; CHECK-LLSC-O0-NEXT:    str q0, [x8]
-; CHECK-LLSC-O0-NEXT:    add sp, sp, #48
+; CHECK-LLSC-O0-NEXT:    mov v0.d[1], x8
+; CHECK-LLSC-O0-NEXT:    str q0, [x3]
 ; CHECK-LLSC-O0-NEXT:    ret
 ;
 ; CHECK-CAS-O0-LABEL: atomic_load_relaxed:
 ; CHECK-CAS-O0:       // %bb.0:
-; CHECK-CAS-O0-NEXT:    sub sp, sp, #48
-; CHECK-CAS-O0-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-CAS-O0-NEXT:    str x2, [sp, #32] // 8-byte Folded Spill
-; CHECK-CAS-O0-NEXT:    str x3, [sp, #40] // 8-byte Folded Spill
-; CHECK-CAS-O0-NEXT:    b .LBB4_1
-; CHECK-CAS-O0-NEXT:  .LBB4_1: // %atomicrmw.start
-; CHECK-CAS-O0-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-CAS-O0-NEXT:    ldr x11, [sp, #32] // 8-byte Folded Reload
-; CHECK-CAS-O0-NEXT:    ldxp x9, x10, [x11]
 ; CHECK-CAS-O0-NEXT:    mov x8, xzr
-; CHECK-CAS-O0-NEXT:    orr x9, x9, x8
-; CHECK-CAS-O0-NEXT:    orr x10, x8, x10
+; CHECK-CAS-O0-NEXT:    mov x0, x8
+; CHECK-CAS-O0-NEXT:    mov x1, x8
+; CHECK-CAS-O0-NEXT:    mov x4, x8
+; CHECK-CAS-O0-NEXT:    mov x5, x8
+; CHECK-CAS-O0-NEXT:    casp x0, x1, x4, x5, [x2]
+; CHECK-CAS-O0-NEXT:    mov x9, x0
+; CHECK-CAS-O0-NEXT:    mov x8, x1
 ; CHECK-CAS-O0-NEXT:    // implicit-def: $q0
 ; CHECK-CAS-O0-NEXT:    mov v0.d[0], x9
-; CHECK-CAS-O0-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-CAS-O0-NEXT:    mov v0.d[1], x10
-; CHECK-CAS-O0-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-CAS-O0-NEXT:    stxp w8, x9, x10, [x11]
-; CHECK-CAS-O0-NEXT:    cbnz w8, .LBB4_1
-; CHECK-CAS-O0-NEXT:    b .LBB4_2
-; CHECK-CAS-O0-NEXT:  .LBB4_2: // %atomicrmw.end
-; CHECK-CAS-O0-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-CAS-O0-NEXT:    ldr x8, [sp, #40] // 8-byte Folded Reload
-; CHECK-CAS-O0-NEXT:    str q0, [x8]
-; CHECK-CAS-O0-NEXT:    add sp, sp, #48
+; CHECK-CAS-O0-NEXT:    mov v0.d[1], x8
+; CHECK-CAS-O0-NEXT:    str q0, [x3]
 ; CHECK-CAS-O0-NEXT:    ret
 
     %r = load atomic i128, i128* %p monotonic, align 16
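
(Both updated -O0 sequences above now keep the whole operation in registers:
the CHECK-LLSC-O0 loop has no spill or reload between the ldxp and the stxp,
so nothing can clear the exclusive monitor, and the CHECK-CAS-O0 version
performs the load with a single casp whose expected and desired values are
all zero.)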


        

