<html>
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - Suboptimal assembly generated for -Os."
   href="https://bugs.llvm.org/show_bug.cgi?id=39213">39213</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Suboptimal assembly generated for -Os.
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>new-bugs
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>7.0
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>new bugs
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>mariusz@podlesny.eu
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr></table>
      <p>
        <div>
        <pre>While playing around with templates I stumbled upon the following issue (the
code below is non-template - I simplified it): the assembly generated for
the store2 function compiled with -Os seems to be "suboptimal".

COMMAND:
clang++ -Os -g0 -DNDEBUG -Werror -Wall -Wextra -pedantic  -std=c++17 -g0 -S main.cpp

CODE:
#include &lt;cstddef&gt;
#include &lt;cstdint&gt;

void store1(uint16_t value, uint8_t* storage)
{
    storage[0] = value;
    storage[1] = value >> 8;
}

void store2(uint16_t value, uint8_t* storage)
{
    for (size_t i = 0; i &lt; sizeof(value); ++i)
        storage[i] = value >> (8* i);
}

RESULT:
_Z6store1tPh:                           # @_Z6store1tPh
        .cfi_startproc
# %bb.0:
        movl    %edi, %eax
        movb    %al, (%rsi)
        movb    %ah, 1(%rsi)
        retq
.Lfunc_end0:
        .size   _Z6store1tPh, .Lfunc_end0-_Z6store1tPh
        .cfi_endproc

_Z6store2tPh:                           # @_Z6store2tPh
        .cfi_startproc
# %bb.0:
        movd    %edi, %xmm1
        movl    $1, %eax
        movq    %rax, %xmm0
        pslldq  $8, %xmm0               # xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
        xorl    %eax, %eax
        pshufd  $68, %xmm1, %xmm1       # xmm1 = xmm1[0,1,0,1]
        movdqa  .LCPI1_0(%rip), %xmm2   # xmm2 = [4294967295,0,4294967295,0]
        pand    %xmm2, %xmm1
        movapd  .LCPI1_1(%rip), %xmm3   # xmm3 = [255,255]
        movdqa  .LCPI1_2(%rip), %xmm4   # xmm4 = [2,2]
        movl    $2, %ecx
.LBB1_1:                                # =>This Inner Loop Header: Depth=1
        movdqa  %xmm0, %xmm5
        psllq   $3, %xmm5
        pand    %xmm2, %xmm5
        movdqa  %xmm1, %xmm6
        psrlq   %xmm5, %xmm6
        pshufd  $78, %xmm5, %xmm5       # xmm5 = xmm5[2,3,0,1]
        movdqa  %xmm1, %xmm7
        psrlq   %xmm5, %xmm7
        movsd   %xmm6, %xmm7            # xmm7 = xmm6[0],xmm7[1]
        andpd   %xmm3, %xmm7
        packuswb        %xmm7, %xmm7
        packuswb        %xmm7, %xmm7
        packuswb        %xmm7, %xmm7
        movd    %xmm7, %edx
        movw    %dx, (%rsi,%rax)
        addq    $2, %rax
        paddq   %xmm4, %xmm0
        cmpq    %rcx, %rax
        jne     .LBB1_1
# %bb.2:
        retq
.Lfunc_end1:
        .size   _Z6store2tPh, .Lfunc_end1-_Z6store2tPh
        .cfi_endproc

Well, the result surprised me quite a bit, as I expected the "store2" function to be
unrolled into something similar to "store1" or, better yet, changed into the code
that gcc 8 generates:

store2(unsigned short, unsigned char*):
        mov     WORD PTR [rsi], di
        ret

The result occurs only with -Os, whereas -O1, -O2, -O3, -Oz and -Og produce much more
"regular" assembly. It also seems to show up in clang 6, 7 and trunk, but not in
clang 5 and earlier (checked on Compiler Explorer).</pre>
        </div>
      </p>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>