[LLVMdev] Replacing Platform Specific IR Codes with Generic Implementation and Introducing Macro Facilities
David Chisnall
David.Chisnall at cl.cam.ac.uk
Sat May 10 11:35:46 PDT 2014
On 10 May 2014, at 19:25, Andrew Trick <atrick at apple.com> wrote:
> The IR is missing a weak variant of cmpxchg. But is there anything else missing at IR level? My understanding was that LLVM’s atomic memory ordering constraints are complete, but that codegen is not highly optimized, and maybe conservative for some targets. Which idiom do you have trouble with on non-x86?
The example from our EuroLLVM talk was this:
_Atomic(int) a; a *= b;
This is (according to the spec) equivalent to this (simplified slightly):
int expected = a;
int desired;
do {
  desired = expected * b;
} while (!atomic_compare_exchange_weak(&a, &expected, desired));
What clang generates is almost this, but with a strong compare and swap:
define void @mul(i32* %a, i32 %b) #0 {
entry:
  %atomic-load = load atomic i32* %a seq_cst, align 4, !tbaa !1
  br label %atomic_op

atomic_op:                                        ; preds = %atomic_op, %entry
  %0 = phi i32 [ %atomic-load, %entry ], [ %1, %atomic_op ]
  %mul = mul nsw i32 %0, %b
  %1 = cmpxchg i32* %a, i32 %0, i32 %mul seq_cst
  %2 = icmp eq i32 %1, %0
  br i1 %2, label %atomic_cont, label %atomic_op

atomic_cont:                                      ; preds = %atomic_op
  ret void
}
This maps trivially to x86:
LBB0_1:
        movl      %ecx, %edx
        imull     %esi, %edx
        movl      %ecx, %eax
        lock
        cmpxchgl  %edx, (%rdi)
        cmpl      %ecx, %eax
        movl      %eax, %ecx
        jne       LBB0_1
For MIPS, what we *should* be generating is:
        sync 0                  # Ensure all other loads / stores are globally visible
retry:
        ll      $t4, 0($a0)     # Load-linked the current value of the atomic int
        mult    $t4, $a1        # Multiply by the other argument
        mflo    $t4             # Get the result
        sc      $t4, 0($a0)     # Try to write it back atomically
        beqz    $t4, retry      # sc leaves 0 in $t4 on failure, so try the whole thing again
        sync 0                  # branch delay slot - ensure seqcst behaviour here
What we actually generate is this:
# BB#0:                                 # %entry
        daddiu  $sp, $sp, -16
        sd      $fp, 8($sp)             # 8-byte Folded Spill
        move    $fp, $sp
        addiu   $3, $zero, 0
$BB0_1:                                 # %entry
                                        # =>This Inner Loop Header: Depth=1
        ll      $2, 0($4)
        bne     $2, $3, $BB0_3
        nop
# BB#2:                                 # %entry
                                        #   in Loop: Header=BB0_1 Depth=1
        addiu   $6, $zero, 0
        sc      $6, 0($4)
        beqz    $6, $BB0_1
        nop
$BB0_3:                                 # %entry
        sync 0
$BB0_4:                                 # %atomic_op
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_5 Depth 2
        move    $3, $2
        mul     $6, $3, $5
        sync 0
$BB0_5:                                 # %atomic_op
                                        #   Parent Loop BB0_4 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        ll      $2, 0($4)
        bne     $2, $3, $BB0_7
        nop
# BB#6:                                 # %atomic_op
                                        #   in Loop: Header=BB0_5 Depth=2
        move    $7, $6
        sc      $7, 0($4)
        beqz    $7, $BB0_5
        nop
$BB0_7:                                 # %atomic_op
                                        #   in Loop: Header=BB0_4 Depth=1
        sync 0
        bne     $2, $3, $BB0_4
        nop
# BB#8:                                 # %atomic_cont
        move    $sp, $fp
        ld      $fp, 8($sp)             # 8-byte Folded Reload
        jr      $ra
        daddiu  $sp, $sp, 16
For correctness, we *have* to implement the cmpxchg in the IR as an ll/sc loop, and so we end up with a nested loop for something that is a single line in the source.
The weak compare-and-exchange loop is a fairly common idiom, but we generate spectacularly bad code for it.
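For comparison, here is a rough sketch of what the same function could look like if the IR had a weak cmpxchg. The syntax is purely illustrative and is my assumption, not something the IR accepts today: a weak flag plus a { i32, i1 } result carrying the loaded value and a success bit.

define void @mul(i32* %a, i32 %b) #0 {
entry:
  %atomic-load = load atomic i32* %a seq_cst, align 4
  br label %atomic_op

atomic_op:
  %0 = phi i32 [ %atomic-load, %entry ], [ %old, %atomic_op ]
  %mul = mul nsw i32 %0, %b
  ; hypothetical weak variant: may fail spuriously, returns the loaded value and a success bit
  %pair = cmpxchg weak i32* %a, i32 %0, i32 %mul seq_cst
  %old = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %atomic_cont, label %atomic_op

atomic_cont:
  ret void
}

Because the success of the underlying store-conditional feeds the back-edge branch directly, and a spurious failure is allowed, a target like MIPS would be free to lower this to a single ll/sc loop along the lines of the hand-written sequence above, rather than nesting one loop inside another.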
David