[PATCH] D23253: [X86] Generalized transformation of `definstr gr8; movzx gr32, gr8` to `xor gr32, gr32; definstr gr8`

bryant via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 7 21:52:29 PDT 2016


bryant added a comment.

I would also like to note that because this pass works by register
re-allocation, it _never_ "pessimizes" the way https://reviews.llvm.org/D21774 does. For instance, under
`-march=x86` (diff of assembler output of
`test/CodeGen/X86/sse42-intrinsics-x86.ll`, with "-" indicating https://reviews.llvm.org/D21774 and "+"
indicating this pass):

   test_x86_sse42_pcmpestria128:
  -# Throughput: 2.25; Uops: 8; Latency: 7; Size: 36
  +# Throughput: 1.55; Uops: 6; Latency: 3; Size: 33
  -    pushl    %ebx
       movl    $7, %eax
       movl    $7, %edx
  -    xorl    %ebx, %ebx
       pcmpestri    $7, %xmm1, %xmm0
  -    seta    %bl
  -    movl    %ebx, %eax
  -    popl    %ebx
  +    seta    %al
  +    movzbl    %al, %eax
   retl
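
The general rewrite, as the title describes, looks like this (a sketch; the register names are illustrative, and `definstr` stands for any gr8-defining instruction):

   # before                        # after
   definstr  %bl                   xorl      %ebx, %ebx
   movzbl    %bl, %ebx             definstr  %bl

The zero-extension becomes implicit: the full 32-bit register is cleared up front with a dependency-breaking `xor` zero idiom, and the gr8-defining instruction then writes only the low byte.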

Or perhaps less obviously (from cmpxchg-i1.ll under x86-64):

   cmpxchg_zext:
  -# Throughput: 2.05; Uops: 6; Latency: 10; Size: 24
  +# Throughput: 1.85; Uops: 7; Latency: 10; Size: 23
  -    xorl    %ecx, %ecx
       movl    %esi, %eax
       lock        cmpxchgl    %edx, (%rdi)
  -    sete    %cl
  -    movl    %ecx, %eax
  +    sete    %al
  +    movzbl    %al, %eax
   retq

On the other hand, because it matches on any gr8-defining instruction (not just
setccs), it can do things like (from pre-ra-sched.ll under x86-64):

   test:
  -# Throughput: 11.05; Uops: 27; Latency: 14; Size: 77
  +# Throughput: 6.55; Uops: 24; Latency: 14; Size: 77
       movzbl    1(%rdi), %r9d
  -    movb    2(%rdi), %al
  -    xorb    %r9b, %al
  +    xorl    %r10d, %r10d
  +    movb    2(%rdi), %r10b
  +    xorb    %r9b, %r10b
       movzbl    3(%rdi), %esi
  -    movb    4(%rdi), %cl
  -    xorb    %sil, %cl
  +    xorl    %eax, %eax
  +    movb    4(%rdi), %al
  +    xorb    %sil, %al
       movzbl    5(%rdi), %r8d
  -    movb    6(%rdi), %dl
  -    xorb    %r8b, %dl
  +    xorl    %ecx, %ecx
  +    movb    6(%rdi), %cl
  +    xorb    %r8b, %cl
       cmpb    $0, (%rdi)
  -    movzbl    %al, %edi
  -    cmovnel    %r9d, %edi
  -    movzbl    %cl, %eax
  +    cmovnel    %r9d, %r10d
       cmovnel    %esi, %eax
  -    movzbl    %dl, %ecx
       cmovnel    %r8d, %ecx
       testl    %r9d, %r9d
  -    cmovnel    %edi, %eax
  +    cmovnel    %r10d, %eax
       testl    %esi, %esi
       cmovel    %ecx, %eax
       retq

which, if IACA is to be believed, is a 40% throughput improvement for free.

Also, note that this differential depends on the zip iterator introduced in
https://reviews.llvm.org/D23252.


Repository:
  rL LLVM

https://reviews.llvm.org/D23253




