[llvm-bugs] [Bug 32710] New: PMADDWD optimization on unsigned 16 bit integer

via llvm-bugs llvm-bugs at lists.llvm.org
Wed Apr 19 05:50:49 PDT 2017


https://bugs.llvm.org/show_bug.cgi?id=32710

            Bug ID: 32710
           Summary: PMADDWD optimization on unsigned 16 bit integer
           Product: new-bugs
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: new bugs
          Assignee: unassignedbugs at nondot.org
          Reporter: ilia.taraban at intel.com
                CC: llvm-bugs at lists.llvm.org

==============test.c==================
#include "stdio.h"
/* Number of elements. N exceeds SHRT_MAX (32767), so once main() stores
   a[i] = i the array contains values that do not fit in a signed 16-bit
   integer. */
#define N 32832 

/* Input data: unsigned 16-bit elements that sq_sum() squares and sums. */
unsigned short a[N];

/* Returns the sum of squares of all N elements of the global array `a`,
   accumulated in an unsigned int.

   The elements are unsigned short, so each one must be widened as an
   unsigned (zero-extended) value before it is multiplied. This loop is the
   reduction pattern that the report shows being compiled to vpmaddwd, which
   multiplies its 16-bit inputs as signed values. */
unsigned int sq_sum(void) {
  int i;
  unsigned int acc = 0;
  for (i = 0; i < N; i++){
     /* Multiply-accumulate reduction over unsigned 16-bit inputs. */
     acc += a[i] * a[i];
  }
  return acc;
}

/* Test driver: fills `a` with 0..N-1, computes the sum of squares with
   sq_sum(), and prints it as an unsigned decimal number. */
int 
main() {
  int i;
  /* a[i] = i; the indices from 32768 up to N-1 produce elements greater than
     SHRT_MAX, which are the inputs that expose the signed/unsigned mismatch. */
  for (i = 0; i < N; i++) {
       a[i]   = i;     
  }

  unsigned int acc;

  acc = sq_sum();
  /* The correctly compiled binary in the report prints 2458652000. */
  printf("%u\n", acc);
  return 0;
}


======================================

>>>clang -v
clang version 5.0.0 (trunk 299776)
...

>>>clang -o simple.exe  -O2 test.c


>>>clang -o test.exe   -O2 -march=skylake-avx512 test.c

>>>./simple.exe
2458652000
>>>./test.exe
2194410848

======================================
Disas of test.exe(299775):
...
  400920:       62 f2 7d 48 33 a0 c0    vpmovzxwd 0x6120c0(%rax),%zmm4
  400927:       20 61 00
  40092a:       62 f2 7d 48 33 a8 e0    vpmovzxwd 0x6120e0(%rax),%zmm5
  400931:       20 61 00
  400934:       62 f2 7d 48 33 b0 00    vpmovzxwd 0x612100(%rax),%zmm6
  40093b:       21 61 00
  40093e:       62 f2 7d 48 33 b8 20    vpmovzxwd 0x612120(%rax),%zmm7
  400945:       21 61 00
  400948:       62 f2 5d 48 40 e4       vpmulld %zmm4,%zmm4,%zmm4
  40094e:       62 f2 55 48 40 ed       vpmulld %zmm5,%zmm5,%zmm5
  400954:       62 f2 4d 48 40 f6       vpmulld %zmm6,%zmm6,%zmm6
  40095a:       62 f2 45 48 40 ff       vpmulld %zmm7,%zmm7,%zmm7
  400960:       62 f1 5d 48 fe c0       vpaddd %zmm0,%zmm4,%zmm0
  400966:       62 f1 55 48 fe c9       vpaddd %zmm1,%zmm5,%zmm1
  40096c:       62 f1 4d 48 fe d2       vpaddd %zmm2,%zmm6,%zmm2
  400972:       62 f1 45 48 fe db       vpaddd %zmm3,%zmm7,%zmm3
...

Disas of test.exe(299776):
...
  400950:       c5 fe 6f a8 c0 20 61    vmovdqu 0x6120c0(%rax),%ymm5
  400957:       00
  400958:       c5 fe 6f b0 e0 20 61    vmovdqu 0x6120e0(%rax),%ymm6
  40095f:       00
  400960:       c5 fe 6f b8 00 21 61    vmovdqu 0x612100(%rax),%ymm7
  400967:       00
  400968:       c5 7e 6f 80 20 21 61    vmovdqu 0x612120(%rax),%ymm8
  40096f:       00
  400970:       c5 d5 f5 ed             vpmaddwd %ymm5,%ymm5,%ymm5
  400974:       62 f3 55 48 3a e8 01    vinserti32x8 $0x1,%ymm0,%zmm5,%zmm5
  40097b:       62 f1 55 48 fe c9       vpaddd %zmm1,%zmm5,%zmm1
...

However, vpmaddwd multiplies its 16-bit operands as signed integers, so the
result is wrong whenever an unsigned 16-bit input is greater than SHRT_MAX.

===============r299776================
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp  (revision 299775)
+++ lib/Target/X86/X86ISelLowering.cpp  (revision 299776)
@@ -34618,6 +34618,51 @@
                      DAG.getConstant(0, DL, VT), NewCmp);
 }

+static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  SDValue MulOp = N->getOperand(0);
+  SDValue Phi = N->getOperand(1);
+
+  if (MulOp.getOpcode() != ISD::MUL)
+    std::swap(MulOp, Phi);
+  if (MulOp.getOpcode() != ISD::MUL)
+    return SDValue();
+
+  ShrinkMode Mode;
+  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  unsigned RegSize = 128;
+  if (Subtarget.hasBWI())
+    RegSize = 512;
+  else if (Subtarget.hasAVX2())
+    RegSize = 256;
+  unsigned VectorSize = VT.getVectorNumElements() * 16;
+  // If the vector size is less than 128, or greater than the supported RegSize,
+  // do not use PMADD.
+  if (VectorSize < 128 || VectorSize > RegSize)
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+                                   VT.getVectorNumElements());
+  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                VT.getVectorNumElements() / 2);
+
+  // Shrink the operands of mul.
+  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
+  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
+
+  // Madd vector size is half of the original vector size
+  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
+  // Fill the rest of the output with 0
+  SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
+  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
+  return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
+}
+
 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
   SDLoc DL(N);
@@ -34695,6 +34740,8 @@
   if (Flags->hasVectorReduction()) {
     if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
       return Sad;
+    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
+      return MAdd;
   }
   EVT VT = N->getValueType(0);
   SDValue Op0 = N->getOperand(0);
======================================================
This optimization cannot be used when the shrink mode is MULU16 (unsigned 16-bit operands), so a possible solution is to add:

--- lib/Target/X86/X86ISelLowering.cpp  (revision 300686)
+++ lib/Target/X86/X86ISelLowering.cpp  (working copy)
@@ -34631,7 +34631,7 @@
     return SDValue();

   ShrinkMode Mode;
-  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))
+  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
     return SDValue();

------------------------
Intel Software Engineer
Ilia Taraban

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170419/9ee69d2b/attachment-0001.html>


More information about the llvm-bugs mailing list