<html>
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - Missing optimisation: Type conversion not vectorised in simple additive reduction"
   href="https://bugs.llvm.org/show_bug.cgi?id=32077">32077</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Missing optimisation: Type conversion not vectorised in simple additive reduction
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>new-bugs
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>new bugs
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>drraph@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr></table>
      <p>
        <div>
        <pre>Consider:

double f(double x[]) {
  float p = 1.0;
  for (int i = 0; i &lt; 16; i++)
    p += x[i];
  return p;
}

clang/llvm with -O3 -march=core-avx2 -ffast-math gives:

.LCPI0_0:
        .quad   4607182418800017408     # double 1
f:                                      # @f
        vmovsd  xmm0, qword ptr [rdi]   # xmm0 = mem[0],zero
        vaddsd  xmm0, xmm0, qword ptr [rip + .LCPI0_0]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 8]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 16]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 24]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 32]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 40]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 48]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 56]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 64]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 72]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 80]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 88]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 96]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 104]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 112]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        vaddsd  xmm0, xmm0, qword ptr [rdi + 120]
        vcvtsd2ss       xmm0, xmm0, xmm0
        vcvtss2sd       xmm0, xmm0, xmm0
        ret


However more efficient would be:

f:
        vcvtpd2ps xmm0, YMMWORD PTR [rdi]                       #4.5
        vcvtpd2ps xmm1, YMMWORD PTR [32+rdi]                    #4.5
        vcvtpd2ps xmm2, YMMWORD PTR [64+rdi]                    #4.5
        vcvtpd2ps xmm3, YMMWORD PTR [96+rdi]                    #4.5
        vaddps    xmm4, xmm0, xmm1                              #2.11
        vaddps    xmm5, xmm2, xmm3                              #2.11
        vaddps    xmm6, xmm4, xmm5                              #2.11
        vmovhlps  xmm7, xmm6, xmm6                              #2.11
        vaddps    xmm8, xmm6, xmm7                              #2.11
        vshufps   xmm9, xmm8, xmm8, 245                         #2.11
        vaddss    xmm10, xmm8, xmm9                             #2.11
        vaddss    xmm0, xmm10, DWORD PTR .L_2il0floatpacket.0[rip] #2.11
        vcvtss2sd xmm0, xmm0, xmm0                              #5.10
        vzeroupper                                              #5.10
        ret                                                     #5.10
.L_2il0floatpacket.0:
        .long   0x3f800000</pre>
        </div>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>