[PATCH] D11382: x86 atomic: optimize a.store(reg op a.load(acquire), release)
JF Bastien
jfb at chromium.org
Thu Jul 23 10:08:13 PDT 2015
jfb added a comment.
Actually, the `*sd` instructions require SSE2, so I fixed the pattern matcher :)
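For context, here is a minimal C++ sketch of the shape of the `fadd_32r` / `fadd_64r` cases below (the actual tests are LLVM IR, so this source shape is an assumption): an acquire load, a floating-point add against a register value, and a release store back to the same location.

#include <atomic>

// Hypothetical source shape for the fadd_32r / fadd_64r test cases:
// acquire-load the atomic, add a register value, release-store back.
void fadd_32r(std::atomic<float> *loc, float val) {
  loc->store(loc->load(std::memory_order_acquire) + val,
             std::memory_order_release);
}

void fadd_64r(std::atomic<double> *loc, double val) {
  loc->store(loc->load(std::memory_order_acquire) + val,
             std::memory_order_release);
}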
To be specific, compiling for x86-32 with `-mattr=+sse` generates the following:
fadd_32r: # @fadd_32r
        # ...
        movl 12(%esp), %eax
        movl (%eax), %ecx
        movl %ecx, (%esp)
        movss (%esp), %xmm0 # xmm0 = mem[0],zero,zero,zero
        addss 16(%esp), %xmm0
        movss %xmm0, 4(%esp)
        movl 4(%esp), %ecx
        movl %ecx, (%eax)
        addl $8, %esp
        retl
# ...
fadd_64r: # @fadd_64r
        # ...
        movl 32(%esp), %esi
        xorl %eax, %eax
        xorl %edx, %edx
        xorl %ebx, %ebx
        xorl %ecx, %ecx
        lock cmpxchg8b (%esi)
        movl %edx, 12(%esp)
        movl %eax, 8(%esp)
        fldl 8(%esp)
        faddl 36(%esp)
        fstpl (%esp)
        movl (%esp), %ebx
        movl 4(%esp), %ecx
        movl (%esi), %eax
        movl 4(%esi), %edx
        # ...
Part of the problem is the x86-32 calling convention, and part is that the 64-bit case goes through x87.
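As an aside, the zeroed registers feeding `lock cmpxchg8b` are the standard way to get an atomic 8-byte load on x86-32: compare-exchange the location against itself. A conceptual C++ sketch of what that sequence implements (illustrative only; the backend emits the instructions directly):

#include <atomic>
#include <cstdint>

// Conceptual sketch of the zeroed-registers + lock cmpxchg8b idiom:
// an atomic 8-byte load done as a compare-exchange of 0 with 0. If
// *p != 0 the exchange fails and `expected` receives the current
// value; if *p == 0 it is rewritten as 0, a no-op. Either way,
// `expected` ends up holding an atomic snapshot of the 8 bytes.
uint64_t atomic_load_64(std::atomic<uint64_t> *p) {
  uint64_t expected = 0;
  p->compare_exchange_strong(expected, 0,
                             std::memory_order_acquire,
                             std::memory_order_acquire);
  return expected;
}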
With `-mattr=+sse,+sse2` the generated code becomes:
fadd_32r: # @fadd_32r
        # ...
        movss 8(%esp), %xmm0 # xmm0 = mem[0],zero,zero,zero
        movl 4(%esp), %eax
        addss (%eax), %xmm0
        movss %xmm0, (%eax)
        retl
# ...
fadd_64r: # @fadd_64r
        # ...
        movl 32(%esp), %esi
        xorl %eax, %eax
        xorl %edx, %edx
        xorl %ebx, %ebx
        xorl %ecx, %ecx
        lock cmpxchg8b (%esi)
        movl %edx, 12(%esp)
        movl %eax, 8(%esp)
        movsd 8(%esp), %xmm0 # xmm0 = mem[0],zero
        addsd 36(%esp), %xmm0
        movsd %xmm0, (%esp)
        movl (%esp), %ebx
        movl 4(%esp), %ecx
        movl (%esi), %eax
        movl 4(%esi), %edx
I could do something with this, but I'm a bit wary of how the calling convention works out (or rather, I'm wary that a slight change in code generation could throw the pattern off).
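For reference, the integer version of the pattern in the title is the case that folds cleanly; a minimal sketch of the source shape (the function name is illustrative):

#include <atomic>

// Integer form of `a.store(reg op a.load(acquire), release)`, the
// pattern named in the title. On x86, plain mov loads and stores
// already have acquire/release semantics, so this can fold into a
// single memory-destination instruction such as `addl %esi, (%eax)`
// with no lock prefix: it is an acquire load plus a release store,
// not an atomic read-modify-write.
void iadd_32r(std::atomic<int> *a, int v) {
  a->store(a->load(std::memory_order_acquire) + v,
           std::memory_order_release);
}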
http://reviews.llvm.org/D11382