[PATCH] D109368: [LV] Vectorize cases with larger number of RT checks, execute only if profitable.

Thu Jul 7 09:48:18 PDT 2022

alexfh added a comment.

In D109368#3636261 <https://reviews.llvm.org/D109368#3636261>, @asmok-g wrote:

> Heads-up: I think this patch caused a mis-compile that's causing a test in TensorFlow <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/tests/tensor_array_ops_test.py> to fail. We're still confirming it and working on a reproducer.

I got it down to this sample:

  void *memmove(void * destination, const void * source, unsigned long num);
  void f(char *s, char *d, int g) {
    while (g--) {
      memmove(d, s, 4);
      d += 4;
    }
  }

Compiling it with `--target=x86_64--linux-gnu -O2` before and after this commit produces assembly that differs in a way that seems wrong to me:

  @@ -1,100 +1,100 @@
          .text
          .file   "input.i"
          .globl  f                               # -- Begin function f
          .p2align        4, 0x90
          .type   f,@function
   f:                                      # @f
          .cfi_startproc
   # %bb.0:
                                           # kill: def $edx killed $edx def $rdx
          testl   %edx, %edx
          je      .LBB0_16
   # %bb.1:
          leal    -1(%rdx), %r8d
  -       cmpl    $7, %r8d
  +       cmpl    $15, %r8d
          jb      .LBB0_2
   # %bb.3:
          leaq    4(%rdi), %rax
          cmpq    %rsi, %rax
          jbe     .LBB0_6
   # %bb.4:
          leaq    (%rsi,%r8,4), %rax
          addq    $4, %rax
          cmpq    %rdi, %rax
          jbe     .LBB0_6
   .LBB0_2:
          movq    %rsi, %rax
   .LBB0_9:
          leal    -1(%rdx), %r8d
          testb   $7, %dl
          je      .LBB0_13
   # %bb.10:
          movl    %edx, %r9d
          andl    $7, %r9d
          xorl    %esi, %esi
          .p2align        4, 0x90
   .LBB0_11:                               # =>This Inner Loop Header: Depth=1
          movl    (%rdi), %ecx
          movl    %ecx, (%rax)
          addq    $4, %rax
          incq    %rsi
          cmpl    %esi, %r9d
          jne     .LBB0_11
   # %bb.12:
          subl    %esi, %edx
   .LBB0_13:
          cmpl    $7, %r8d
          jb      .LBB0_16
   # %bb.14:
          movl    %edx, %ecx
          xorl    %edx, %edx
          .p2align        4, 0x90
   .LBB0_15:                               # =>This Inner Loop Header: Depth=1
          movl    (%rdi), %esi
          movl    %esi, (%rax,%rdx,4)
          movl    (%rdi), %esi
          movl    %esi, 4(%rax,%rdx,4)
          movl    (%rdi), %esi
          movl    %esi, 8(%rax,%rdx,4)
          movl    (%rdi), %esi
          movl    %esi, 12(%rax,%rdx,4)
          movl    (%rdi), %esi
          movl    %esi, 16(%rax,%rdx,4)
          movl    (%rdi), %esi
          movl    %esi, 20(%rax,%rdx,4)
          movl    (%rdi), %esi
          movl    %esi, 24(%rax,%rdx,4)
          movl    (%rdi), %esi
          movl    %esi, 28(%rax,%rdx,4)
          addq    $8, %rdx
          cmpl    %edx, %ecx
          jne     .LBB0_15
          jmp     .LBB0_16
   .LBB0_6:
          incq    %r8
          movq    %r8, %r9
          andq    $-8, %r9
          subl    %r9d, %edx
          leaq    (%rsi,%r9,4), %rax
          movd    (%rdi), %xmm0                   # xmm0 = mem[0],zero,zero,zero
          pshufd  $0, %xmm0, %xmm0                # xmm0 = xmm0[0,0,0,0]
          xorl    %ecx, %ecx
          .p2align        4, 0x90
   .LBB0_7:                                # =>This Inner Loop Header: Depth=1
          movdqu  %xmm0, (%rsi,%rcx,4)
          movdqu  %xmm0, 16(%rsi,%rcx,4)
          addq    $8, %rcx
          cmpq    %rcx, %r9
          jne     .LBB0_7
   # %bb.8:
          cmpq    %r9, %r8
          jne     .LBB0_9
   .LBB0_16:
          retq
   .Lfunc_end0:
          .size   f, .Lfunc_end0-f
          .cfi_endproc
                                           # -- End function
  -       .ident  "clang version google3-trunk (aa78c5298ea37f2ca8150dc0a1c880be7ec438f4)"
  +       .ident  "clang version google3-trunk (644a965c1efef68f22d9495e4cefbb599c214788)"
          .section        ".note.GNU-stack","",@progbits
          .addrsig

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D109368/new/

https://reviews.llvm.org/D109368