[PATCH] D11382: x86 atomic: optimize a.store(reg op a.load(acquire), release)
JF Bastien
jfb at chromium.org
Thu Jul 23 10:08:13 PDT 2015
jfb added a comment.
Actually, the `*sd` instructions require SSE2, so I fixed the pattern matcher :)
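For context, here is a minimal C++ sketch of the shape of the `fadd_32r` / `fadd_64r` cases below (the actual tests are LLVM IR, so this source shape is an assumption): an acquire load, a floating-point add against a register value, and a release store back to the same location.

#include <atomic>

// Hypothetical source shape for the fadd_32r / fadd_64r test cases:
// acquire-load the atomic, add a register value, release-store back.
void fadd_32r(std::atomic<float> *loc, float val) {
  loc->store(loc->load(std::memory_order_acquire) + val,
             std::memory_order_release);
}

void fadd_64r(std::atomic<double> *loc, double val) {
  loc->store(loc->load(std::memory_order_acquire) + val,
             std::memory_order_release);
}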
To be specific, compiling for x86-32 with `-mattr=+sse` generates the following:
fadd_32r: # @fadd_32r
        # ...
        movl 12(%esp), %eax
        movl (%eax), %ecx
        movl %ecx, (%esp)
        movss (%esp), %xmm0 # xmm0 = mem[0],zero,zero,zero
        addss 16(%esp), %xmm0
        movss %xmm0, 4(%esp)
        movl 4(%esp), %ecx
        movl %ecx, (%eax)
        addl $8, %esp
        retl
# ...
fadd_64r: # @fadd_64r
        # ...
        movl 32(%esp), %esi
        xorl %eax, %eax
        xorl %edx, %edx
        xorl %ebx, %ebx
        xorl %ecx, %ecx
        lock cmpxchg8b (%esi)
        movl %edx, 12(%esp)
        movl %eax, 8(%esp)
        fldl 8(%esp)
        faddl 36(%esp)
        fstpl (%esp)
        movl (%esp), %ebx
        movl 4(%esp), %ecx
        movl (%esi), %eax
        movl 4(%esi), %edx
        # ...
Part of the problem is the x86-32 calling convention, and part is that the 64-bit case goes through x87.
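As an aside, the zeroed registers feeding `lock cmpxchg8b` are the standard way to get an atomic 8-byte load on x86-32: compare-exchange the location against itself. A conceptual C++ sketch of what that sequence implements (illustrative only; the backend emits the instructions directly):

#include <atomic>
#include <cstdint>

// Conceptual sketch of the zeroed-registers + lock cmpxchg8b idiom:
// an atomic 8-byte load done as a compare-exchange of 0 with 0. If
// *p != 0 the exchange fails and `expected` receives the current
// value; if *p == 0 it is rewritten as 0, a no-op. Either way,
// `expected` ends up holding an atomic snapshot of the 8 bytes.
uint64_t atomic_load_64(std::atomic<uint64_t> *p) {
  uint64_t expected = 0;
  p->compare_exchange_strong(expected, 0,
                             std::memory_order_acquire,
                             std::memory_order_acquire);
  return expected;
}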
With `-mattr=+sse,+sse2` the generated code becomes:
fadd_32r: # @fadd_32r
        # ...
        movss 8(%esp), %xmm0 # xmm0 = mem[0],zero,zero,zero
        movl 4(%esp), %eax
        addss (%eax), %xmm0
        movss %xmm0, (%eax)
        retl
# ...
fadd_64r: # @fadd_64r
        # ...
        movl 32(%esp), %esi
        xorl %eax, %eax
        xorl %edx, %edx
        xorl %ebx, %ebx
        xorl %ecx, %ecx
        lock cmpxchg8b (%esi)
        movl %edx, 12(%esp)
        movl %eax, 8(%esp)
        movsd 8(%esp), %xmm0 # xmm0 = mem[0],zero
        addsd 36(%esp), %xmm0
        movsd %xmm0, (%esp)
        movl (%esp), %ebx
        movl 4(%esp), %ecx
        movl (%esi), %eax
        movl 4(%esi), %edx
I could do something with this, but I'm a bit wary of how the calling convention works out (or rather, I'm wary that a slight change in code generation could throw the pattern off).
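For reference, the integer version of the pattern in the title is the case that folds cleanly; a minimal sketch of the source shape (the function name is illustrative):

#include <atomic>

// Integer form of `a.store(reg op a.load(acquire), release)`, the
// pattern named in the title. On x86, plain mov loads and stores
// already have acquire/release semantics, so this can fold into a
// single memory-destination instruction such as `addl %esi, (%eax)`
// with no lock prefix: it is an acquire load plus a release store,
// not an atomic read-modify-write.
void iadd_32r(std::atomic<int> *a, int v) {
  a->store(a->load(std::memory_order_acquire) + v,
           std::memory_order_release);
}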
http://reviews.llvm.org/D11382