<html>
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link bz_status_NEW"
   title="NEW - [Regression] Suboptimal loop exit codegen"
   href="https://bugs.llvm.org/show_bug.cgi?id=49296">49296</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>[Regression] Suboptimal loop exit codegen
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Loop Optimizer
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>david.bolvansky@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr></table>
      <p>
        <div>
        <pre>#include &lt;immintrin.h&gt;
#include &lt;stdint.h&gt;

typedef uint32_t u32v8 __attribute__((vector_size(32)));
void double_load(int8_t *__restrict out, const int8_t *__restrict input)
{
    u32v8 *vin = (u32v8 *)input;
    u32v8 *vout = (u32v8 *)out;
    for (unsigned i=0 ; i&lt;1024 ; i+=32){
        u32v8 in = *vin++;
        *vout++ = in | (in >> 4);
    }
}


Flags: -O3 -mavx2 -fno-unroll-loops

LLVM 10 and newer:
double_load(signed char*, signed char const*):                   # @double_load(signed char*, signed char const*)
        xorl    %eax, %eax
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vmovdqa (%rsi,%rax), %ymm0
        vpsrld  $4, %ymm0, %ymm1
        vpor    %ymm0, %ymm1, %ymm0
        vmovdqa %ymm0, (%rdi,%rax)
        addq    $32, %rax
        leal    -32(%rax), %ecx
        cmpl    $992, %ecx              # imm = 0x3E0
        jb      .LBB0_1
        vzeroupper
        retq


LLVM 9:
double_load(signed char*, signed char const*):                   # @double_load(signed char*, signed char const*)
        xorl    %eax, %eax
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vmovdqa (%rsi,%rax), %ymm0
        vpsrld  $4, %ymm0, %ymm1
        vpor    %ymm0, %ymm1, %ymm0
        vmovdqa %ymm0, (%rdi,%rax)
        addq    $32, %rax
        cmpl    $1024, %eax             # imm = 0x400
        jb      .LBB0_1
        vzeroupper
        retq

LLVM IR comparison:
LLVM 10+:
  %15 = add nuw nsw i32 %8, 32
  %16 = icmp ult i32 %8, 992

LLVM 9:
  %15 = add nuw nsw i32 %8, 32
  %16 = icmp ult i32 %15, 1024



Codegen: <a href="https://godbolt.org/z/GzaTPj">https://godbolt.org/z/GzaTPj</a></pre>
        </div>
      </p>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>