<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - AArch64 compare_exchange_weak leads to a conditional branch on a constant (and very poor branch layout)"
href="https://bugs.llvm.org/show_bug.cgi?id=38173">38173</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>AArch64 compare_exchange_weak leads to a conditional branch on a constant (and very poor branch layout)
</td>
</tr>
<tr>
<th>Product</th>
<td>new-bugs
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Keywords</th>
<td>performance
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>new bugs
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>peter@cordes.ca
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>This code compiles to some really dumb asm for AArch64 (and moderately dumb
asm for x86-64):
#include <atomic>
#include <cstdint>

uint64_t CAS_retry(std::atomic<uint64_t>& foo)
{
    auto expected = foo.load(std::memory_order_relaxed);
    while(!foo.compare_exchange_weak(
              expected, expected + 2,
              std::memory_order_seq_cst, std::memory_order_relaxed))
        {}
    return expected;
}
On Godbolt, with clang 6.0 or trunk (337113) and
-std=gnu++11 -O3 -Wall -target aarch64-linux-android -stdlib=libc++, we get:
CAS_retry(std::__1::atomic<unsigned long long>&):
        ldr     x8, [x0]             # it doesn't optimize into a LL/SC retry loop, but that's expected
                                     # first iteration of the CAS retry loop is peeled
        ldaxr   x9, [x0]
        cmp     x9, x8
        b.ne    .LBB0_3
        add     x10, x8, #2          // =2
        stlxr   w11, x10, [x0]
        cbnz    w11, .LBB0_4
        orr     w10, wzr, #0x1       ###### w10 = 0|1 = 1
        tbz     w10, #0, .LBB0_5     ###### then branch on the low bit: always not-taken
        b       .LBB0_10             ###### jump to epilogue
.LBB0_3:
        clrex
.LBB0_4:
        mov     w10, wzr             ###### w10 = 0
        tbnz    w10, #0, .LBB0_10    ###### if(w10[0]!=0) always falls through
.LBB0_5:                             # The main CAS retry loop is duplicated.
        mov     x8, x9
        ldaxr   x9, [x0]
        cmp     x9, x8
        b.ne    .LBB0_8
        add     x10, x8, #2          // =2
        stlxr   w11, x10, [x0]
        cbnz    w11, .LBB0_9
        orr     w10, wzr, #0x1
        cbz     w10, .LBB0_5         #### Again branching on a constant
        b       .LBB0_10
.LBB0_8:                             // in Loop: Header=BB0_5 Depth=1
        clrex
.LBB0_9:                             // in Loop: Header=BB0_5 Depth=1
        mov     w10, wzr
        cbz     w10, .LBB0_5
.LBB0_10:
        mov     x0, x8
        ret
See
<a href="https://stackoverflow.com/questions/51346815/atomically-clearing-lowest-non-zero-bit-of-an-unsigned-integer/51353947#51353947">https://stackoverflow.com/questions/51346815/atomically-clearing-lowest-non-zero-bit-of-an-unsigned-integer/51353947#51353947</a>
for more detailed commentary on the version you get from
while(unlikely(!cas(...))) {}.
This hand-written version is what we *should* emit if we still fail to
optimize the compare_exchange_weak loop into an LL/SC retry loop (if that's even legal).
*This* version is definitely legal: it makes all the same memory accesses as
the source, and as clang's current code, but without being ridiculous:
        ldr     x9, [x0]            @ x9 = expected = foo.load(relaxed)
.Lcas_retry:
        ldaxr   x8, [x0]            @ x8 = the CAS load (a=acquire, x=exclusive: pairs with a later stxr)
        cmp     x8, x9              @ compare part of the CAS
        b.ne    .Lcas_fail
        add     x10, x9, #2
        stlxr   w11, x10, [x0]      @ the CAS store (sequential-release)
        cbnz    w11, .Lllsc_fail
        # LL/SC success falls through
#.Lepilogue:
        mov     x0, x8
        ret
.Lcas_fail:
        clrex                       @ clear the ldxr/stxr transaction
.Lllsc_fail:
        mov     x9, x8              @ update expected
        b       .Lcas_retry         @ instead of duplicating the loop, jump back to the existing one with x9 = expected
No taken branches in the fast path (the no-contention case).
Instead of peeling the first iteration of the while() loop, I just have the
failure cases jump over the function epilogue. LL/SC failure is slow anyway,
and unlikely, so putting the setup for the next iteration out-of-line makes a
lot of sense (especially if the source uses an unlikely() / __builtin_expect()
macro).
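For reference, a minimal sketch of that hinted source, assuming unlikely() is
the usual wrapper around __builtin_expect (the CAS_retry_hinted name is just
for illustration):

#include <atomic>
#include <cstdint>

// Assumed convenience macro; C++20 code could use [[unlikely]] instead.
#define unlikely(x) __builtin_expect(!!(x), 0)

uint64_t CAS_retry_hinted(std::atomic<uint64_t>& foo)
{
    auto expected = foo.load(std::memory_order_relaxed);
    // Hint that the CAS normally succeeds, so the retry setup can go out of line.
    while (unlikely(!foo.compare_exchange_weak(
               expected, expected + 2,
               std::memory_order_seq_cst, std::memory_order_relaxed)))
    {}
    return expected;
}

With that hint, the fall-through fast path and out-of-line retry setup shown
above are exactly the block layout you'd want.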
--------
x86: gcc does this, which looks optimal:
CAS_retry(std::atomic<unsigned long>&):
        mov     rax, QWORD PTR [rdi]
.L2:
        lea     rdx, [rax+2]
        lock cmpxchg QWORD PTR [rdi], rdx
        jne     .L2
        ret
clang does this, with a bunch of extra MOV instructions; and because the first
iteration is peeled, the fast path has to take a branch:
CAS_retry(std::atomic<unsigned long>&):
        mov     rcx, qword ptr [rdi]
        lea     rdx, [rcx + 2]
        mov     rax, rcx
        lock cmpxchg qword ptr [rdi], rdx
        je      .LBB0_3
.LBB0_1:                            # =>This Inner Loop Header: Depth=1
        mov     rcx, rax            #### This is especially suspect
        lea     rdx, [rcx + 2]      # this could have used rax as a source
        mov     rax, rcx            # rax = rcx from the previous wasted mov
        lock cmpxchg qword ptr [rdi], rdx
        jne     .LBB0_1
.LBB0_3:
        mov     rax, rcx            # redundant: cmpxchg already leaves expected in RAX on success
        ret
I don't think there's likely to be a branch-prediction performance advantage
from peeling the first iteration here. I guess there could be in the general
case of other loops whose iteration counts don't depend on thread timing,
though: a loop-or-not branch separate from the main loop branch might predict
better for patterns like 0 iterations vs. 30 iterations, where there's some
pattern to when the loop should fall through the first time.</pre>
</div>
</p>
</body>
</html>