<html lang="en">

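    <head>

      <meta charset="utf-8">

      <title>Bug 37423 - Loop vectorizer produces extremely poor code for pattern fills</title>

      <base href="https://bugs.llvm.org/">

    </head>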

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Loop vectorizer produces extremely poor code for pattern fills"

   href="https://bugs.llvm.org/show_bug.cgi?id=37423">37423</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Loop vectorizer produces extremely poor code for pattern fills

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>6.0

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Windows NT

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Loop Optimizer

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>fabiang@radgametools.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

        <div>

        <pre>void patternFill(int *arr, int count)

{

    for (int i = 0; i &lt; count; i++)

        arr[i] = (i &amp; 1) ? 456 : 123;

}

With clang 6.0 release, "clang -O2" on x86-64, this turns into a lot of code,

with the inner loop being

.LBB0_8: # =&gt;This Inner Loop Header: Depth=1

  movdqa %xmm1, %xmm7

  pand %xmm4, %xmm7

  movdqa %xmm2, %xmm0

  pand %xmm4, %xmm0

  pcmpeqd %xmm3, %xmm0

  pshufd $177, %xmm0, %xmm5 # xmm5 = xmm0[1,0,3,2]

  pand %xmm0, %xmm5

  pcmpeqd %xmm3, %xmm7

  pshufd $177, %xmm7, %xmm0 # xmm0 = xmm7[1,0,3,2]

  pand %xmm7, %xmm0

  shufps $136, %xmm0, %xmm5 # xmm5 = xmm5[0,2],xmm0[0,2]

  movaps %xmm5, %xmm0

  andnps %xmm8, %xmm0

  andps %xmm9, %xmm5

  orps %xmm0, %xmm5

  movups %xmm5, (%rdi,%rsi,4)

  movups %xmm5, 16(%rdi,%rsi,4)

  movups %xmm5, 32(%rdi,%rsi,4)

  movups %xmm5, 48(%rdi,%rsi,4)

  addq $16, %rsi

  paddq %xmm6, %xmm2

  paddq %xmm6, %xmm1

  addq $4, %rax

  jne .LBB0_8

by comparison, compiling with "-fno-vectorize" results in the much better (in

terms of both code size and expected execution time)

  movaps .LCPI0_0(%rip), %xmm0 # xmm0 = [123,456,123,456]

.LBB0_8: # =&gt;This Inner Loop Header: Depth=1

  movups %xmm0, (%rdi,%rcx,4)

  addq $4, %rcx

  cmpq %rcx, %rdx

  jne .LBB0_8</pre>

        </div>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>