[llvm-bugs] [Bug 32710] New: PMADDWD optimization on unsigned 16 bit integer
via llvm-bugs
llvm-bugs at lists.llvm.org
Wed Apr 19 05:50:49 PDT 2017
https://bugs.llvm.org/show_bug.cgi?id=32710
Bug ID: 32710
Summary: PMADDWD optimization on unsigned 16 bit integer
Product: new-bugs
Version: trunk
Hardware: PC
OS: All
Status: NEW
Severity: enhancement
Priority: P
Component: new bugs
Assignee: unassignedbugs at nondot.org
Reporter: ilia.taraban at intel.com
CC: llvm-bugs at lists.llvm.org
==============test.c==================
#include "stdio.h"
#define N 32832
unsigned short a[N];
/* Returns the sum of the squares of all N elements of the global array a.
 * The total is accumulated in an unsigned int, so it wraps on overflow. */
unsigned int sq_sum(void) {
int i;
unsigned int acc = 0;
for (i = 0; i < N; i++){
/* a[i] is an unsigned short: both operands must be treated as zero-extended
 * 16-bit values when they are multiplied. */
acc += a[i] * a[i];
}
return acc;
}
/* Reproducer driver: stores i into a[i] for every index, calls sq_sum(),
 * and prints the returned value as an unsigned decimal number. */
int
main() {
int i;
/* N is 32832, so the stored values run past 32767; the last elements do not
 * fit in a signed 16-bit integer. */
for (i = 0; i < N; i++) {
a[i] = i;
}
unsigned int acc;
acc = sq_sum();
printf("%u\n", acc);
return 0;
}
======================================
>>>clang -v
clang version 5.0.0 (trunk 299776)
...
>>>clang -o simple.exe -O2 test.c
>>>clang -o test.exe -O2 -march=skylake-avx512 test.c
>>>./simple.exe
2458652000
>>>./test.exe
2194410848
======================================
Disas of test.exe(299775):
...
400920: 62 f2 7d 48 33 a0 c0 vpmovzxwd 0x6120c0(%rax),%zmm4
400927: 20 61 00
40092a: 62 f2 7d 48 33 a8 e0 vpmovzxwd 0x6120e0(%rax),%zmm5
400931: 20 61 00
400934: 62 f2 7d 48 33 b0 00 vpmovzxwd 0x612100(%rax),%zmm6
40093b: 21 61 00
40093e: 62 f2 7d 48 33 b8 20 vpmovzxwd 0x612120(%rax),%zmm7
400945: 21 61 00
400948: 62 f2 5d 48 40 e4 vpmulld %zmm4,%zmm4,%zmm4
40094e: 62 f2 55 48 40 ed vpmulld %zmm5,%zmm5,%zmm5
400954: 62 f2 4d 48 40 f6 vpmulld %zmm6,%zmm6,%zmm6
40095a: 62 f2 45 48 40 ff vpmulld %zmm7,%zmm7,%zmm7
400960: 62 f1 5d 48 fe c0 vpaddd %zmm0,%zmm4,%zmm0
400966: 62 f1 55 48 fe c9 vpaddd %zmm1,%zmm5,%zmm1
40096c: 62 f1 4d 48 fe d2 vpaddd %zmm2,%zmm6,%zmm2
400972: 62 f1 45 48 fe db vpaddd %zmm3,%zmm7,%zmm3
...
Disas of test.exe(299776):
...
400950: c5 fe 6f a8 c0 20 61 vmovdqu 0x6120c0(%rax),%ymm5
400957: 00
400958: c5 fe 6f b0 e0 20 61 vmovdqu 0x6120e0(%rax),%ymm6
40095f: 00
400960: c5 fe 6f b8 00 21 61 vmovdqu 0x612100(%rax),%ymm7
400967: 00
400968: c5 7e 6f 80 20 21 61 vmovdqu 0x612120(%rax),%ymm8
40096f: 00
400970: c5 d5 f5 ed vpmaddwd %ymm5,%ymm5,%ymm5
400974: 62 f3 55 48 3a e8 01 vinserti32x8 $0x1,%ymm0,%zmm5,%zmm5
40097b: 62 f1 55 48 fe c9 vpaddd %zmm1,%zmm5,%zmm1
...
However, vpmaddwd multiplies its operands as signed 16-bit integers, so the
result is wrong whenever an element is greater than SHRT_MAX (here, a[32768]
through a[32831]).
===============r299776================
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp (revision 299775)
+++ lib/Target/X86/X86ISelLowering.cpp (revision 299776)
@@ -34618,6 +34618,51 @@
DAG.getConstant(0, DL, VT), NewCmp);
}
+static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue MulOp = N->getOperand(0);
+ SDValue Phi = N->getOperand(1);
+
+ if (MulOp.getOpcode() != ISD::MUL)
+ std::swap(MulOp, Phi);
+ if (MulOp.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ unsigned RegSize = 128;
+ if (Subtarget.hasBWI())
+ RegSize = 512;
+ else if (Subtarget.hasAVX2())
+ RegSize = 256;
+ unsigned VectorSize = VT.getVectorNumElements() * 16;
+ // If the vector size is less than 128, or greater than the supported RegSize,
+ // do not use PMADD.
+ if (VectorSize < 128 || VectorSize > RegSize)
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements());
+ EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ VT.getVectorNumElements() / 2);
+
+ // Shrink the operands of mul.
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
+
+ // Madd vector size is half of the original vector size
+ SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
+ // Fill the rest of the output with 0
+ SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
+ return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
+}
+
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
@@ -34695,6 +34740,8 @@
if (Flags->hasVectorReduction()) {
if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
return Sad;
+ if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
+ return MAdd;
}
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
======================================================
This optimization can't be used when the mode is MULU16, so a possible fix is the following change:
--- lib/Target/X86/X86ISelLowering.cpp (revision 300686)
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
@@ -34631,7 +34631,7 @@
return SDValue();
ShrinkMode Mode;
- if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))
+ if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
return SDValue();
------------------------
Intel Software Engineer
Ilia Taraban
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170419/9ee69d2b/attachment-0001.html>
More information about the llvm-bugs
mailing list