<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - PMADDWD optimization on unsigned 16 bit integer"

   href="https://bugs.llvm.org/show_bug.cgi?id=32710">32710</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>PMADDWD optimization on unsigned 16 bit integer

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>new-bugs

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>All

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>new bugs

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>ilia.taraban@intel.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>==============test.c==================

#include "stdio.h"

#define N 32832 

unsigned short a[N];

unsigned int sq_sum(void) {

  int i;

  unsigned int acc = 0;

  for (i = 0; i < N; i++){

     acc += a[i] * a[i];

  }

  return acc;

}

int 

main() {

  int i;

  for (i = 0; i < N; i++) {

       a[i]   = i;     

  }

  unsigned int acc;

  acc = sq_sum();

  printf("%u\n", acc);

  return 0;

}

======================================

<span class="quote">>>>clang -v</span >

clang version 5.0.0 (trunk 299776)

...

<span class="quote">>>>clang -o simple.exe  -O2 test.c</span >

<span class="quote">>>>clang -o test.exe   -O2 -march=skylake-avx512 test.c</span >

<span class="quote">>>>./simple.exe</span >

2458652000

<span class="quote">>>>./test.exe</span >

2194410848

======================================

Disas of test.exe(299775):

...

  400920:       62 f2 7d 48 33 a0 c0    vpmovzxwd 0x6120c0(%rax),%zmm4

  400927:       20 61 00

  40092a:       62 f2 7d 48 33 a8 e0    vpmovzxwd 0x6120e0(%rax),%zmm5

  400931:       20 61 00

  400934:       62 f2 7d 48 33 b0 00    vpmovzxwd 0x612100(%rax),%zmm6

  40093b:       21 61 00

  40093e:       62 f2 7d 48 33 b8 20    vpmovzxwd 0x612120(%rax),%zmm7

  400945:       21 61 00

  400948:       62 f2 5d 48 40 e4       vpmulld %zmm4,%zmm4,%zmm4

  40094e:       62 f2 55 48 40 ed       vpmulld %zmm5,%zmm5,%zmm5

  400954:       62 f2 4d 48 40 f6       vpmulld %zmm6,%zmm6,%zmm6

  40095a:       62 f2 45 48 40 ff       vpmulld %zmm7,%zmm7,%zmm7

  400960:       62 f1 5d 48 fe c0       vpaddd %zmm0,%zmm4,%zmm0

  400966:       62 f1 55 48 fe c9       vpaddd %zmm1,%zmm5,%zmm1

  40096c:       62 f1 4d 48 fe d2       vpaddd %zmm2,%zmm6,%zmm2

  400972:       62 f1 45 48 fe db       vpaddd %zmm3,%zmm7,%zmm3

...

Disas of test.exe(299776):

...

  400950:       c5 fe 6f a8 c0 20 61    vmovdqu 0x6120c0(%rax),%ymm5

  400957:       00

  400958:       c5 fe 6f b0 e0 20 61    vmovdqu 0x6120e0(%rax),%ymm6

  40095f:       00

  400960:       c5 fe 6f b8 00 21 61    vmovdqu 0x612100(%rax),%ymm7

  400967:       00

  400968:       c5 7e 6f 80 20 21 61    vmovdqu 0x612120(%rax),%ymm8

  40096f:       00

  400970:       c5 d5 f5 ed             vpmaddwd %ymm5,%ymm5,%ymm5

  400974:       62 f3 55 48 3a e8 01    vinserti32x8 $0x1,%ymm0,%zmm5,%zmm5

  40097b:       62 f1 55 48 fe c9       vpaddd %zmm1,%zmm5,%zmm1

...

But vpmaddwd multiplies its operands as signed 16-bit integers, so we get a wrong

answer when the arguments are bigger than SHRT_MAX.

===============r299776================

Index: lib/Target/X86/X86ISelLowering.cpp

===================================================================

--- lib/Target/X86/X86ISelLowering.cpp  (revision 299775)

+++ lib/Target/X86/X86ISelLowering.cpp  (revision 299776)

@@ -34618,6 +34618,51 @@

                      DAG.getConstant(0, DL, VT), NewCmp);

 }

+static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,

+                                      const X86Subtarget &Subtarget) {

+  SDValue MulOp = N->getOperand(0);

+  SDValue Phi = N->getOperand(1);

+

+  if (MulOp.getOpcode() != ISD::MUL)

+    std::swap(MulOp, Phi);

+  if (MulOp.getOpcode() != ISD::MUL)

+    return SDValue();

+

+  ShrinkMode Mode;

+  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))

+    return SDValue();

+

+  EVT VT = N->getValueType(0);

+

+  unsigned RegSize = 128;

+  if (Subtarget.hasBWI())

+    RegSize = 512;

+  else if (Subtarget.hasAVX2())

+    RegSize = 256;

+  unsigned VectorSize = VT.getVectorNumElements() * 16;

+  // If the vector size is less than 128, or greater than the supported RegSize,

+  // do not use PMADD.

+  if (VectorSize < 128 || VectorSize > RegSize)

+    return SDValue();

+

+  SDLoc DL(N);

+  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,

+                                   VT.getVectorNumElements());

+  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,

+                                VT.getVectorNumElements() / 2);

+

+  // Shrink the operands of mul.

+  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));

+  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));

+

+  // Madd vector size is half of the original vector size

+  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);

+  // Fill the rest of the output with 0

+  SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);

+  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);

+  return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);

+}

+

 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,

                                      const X86Subtarget &Subtarget) {

   SDLoc DL(N);

@@ -34695,6 +34740,8 @@

   if (Flags->hasVectorReduction()) {

     if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))

       return Sad;

+    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))

+      return MAdd;

   }

   EVT VT = N->getValueType(0);

   SDValue Op0 = N->getOperand(0);

======================================================

This optimization can't be used with MULU16, so a possible solution is to add:

--- lib/Target/X86/X86ISelLowering.cpp  (revision 300686)

+++ lib/Target/X86/X86ISelLowering.cpp  (working copy)

@@ -34631,7 +34631,7 @@

     return SDValue();

   ShrinkMode Mode;

-  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))

+  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)

     return SDValue();

------------------------

Intel Software Engineer

Ilia Taraban</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>