<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - __m256i (a+b*2)+b (vpmullw;vpaddw) is slower than (b+a*2)+a (3 vpaddw)"

   href="https://bugs.llvm.org/show_bug.cgi?id=48078">48078</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>__m256i (a+b*2)+b (vpmullw;vpaddw) is slower than (b+a*2)+a (3 vpaddw)

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>i@maskray.me

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, pengfei.wang@intel.com, spatel+llvm@rotateright.com

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Discovered by JP Maaninen

#include &lt;immintrin.h&gt;

__m256i Slow(__m256i a, __m256i b) {

  __m256i c = _mm256_add_epi16(a, _mm256_slli_epi16(a, 1)); 

  return _mm256_add_epi16(c, b);

}

        vpmullw .LCPI0_0(%rip), %ymm0, %ymm0

        vpaddw  %ymm1, %ymm0, %ymm0

        retq

__m256i Fast(__m256i a, __m256i b) {

  __m256i c = _mm256_add_epi16(b, _mm256_slli_epi16(a, 1)); 

  return _mm256_add_epi16(c, a);

}

        vpaddw  %ymm0, %ymm0, %ymm2

        vpaddw  %ymm0, %ymm1, %ymm0

        vpaddw  %ymm2, %ymm0, %ymm0

        retq

----

This is either instcombine's problem or the backend's lack of optimization.

define dso_local &lt;4 x i64&gt; @_Z4SlowDv4_xS_(&lt;4 x i64&gt; %a, &lt;4 x i64&gt; %b)

local_unnamed_addr #0 {

entry:

  %0 = bitcast &lt;4 x i64&gt; %a to &lt;16 x i16&gt;

  %1 = shl &lt;16 x i16&gt; %0, &lt;i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16

1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1&gt;

  %add.i5 = add &lt;16 x i16&gt; %1, %0

  %2 = bitcast &lt;16 x i16&gt; %add.i5 to &lt;4 x i64&gt;

  %3 = bitcast &lt;4 x i64&gt; %b to &lt;16 x i16&gt;

  %add.i = add &lt;16 x i16&gt; %3, %add.i5

  %4 = bitcast &lt;16 x i16&gt; %add.i to &lt;4 x i64&gt;

  ret &lt;4 x i64&gt; %4

}

attributes #0 = { norecurse nounwind readnone uwtable

"disable-tail-calls"="false" "frame-pointer"="none"

"less-precise-fpmad"="false" "min-legal-vector-width"="256"

"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"

"no-signed-zeros-fp-math"="false" "no-trapping-math"="true"

"stack-protector-buffer-size"="8" "target-cpu"="haswell"

"target-features"="+avx,+avx2,+bmi,+bmi2,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"

"unsafe-fp-math"="false" "use-soft-float"="false" }

opt -passes=instcombine -S generates

  %add.i5 = mul &lt;16 x i16&gt; %0, &lt;i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16

3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3&gt;

which is lowered to VPMULLWYrm in X86ISelDAGToDAG and persists through every later

codegen pass.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>