<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - PMADDWD optimization on unsigned 16 bit integer"
href="https://bugs.llvm.org/show_bug.cgi?id=32710">32710</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>PMADDWD optimization on unsigned 16 bit integer
</td>
</tr>
<tr>
<th>Product</th>
<td>new-bugs
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>All
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>new bugs
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>ilia.taraban@intel.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>==============test.c==================
#include "stdio.h"
#define N 32832
unsigned short a[N];
unsigned int sq_sum(void) {
int i;
unsigned int acc = 0;
for (i = 0; i < N; i++){
acc += a[i] * a[i];
}
return acc;
}
int
main() {
int i;
for (i = 0; i < N; i++) {
a[i] = i;
}
unsigned int acc;
acc = sq_sum();
printf("%u\n", acc);
return 0;
}
======================================
<span class="quote">>>>clang -v</span >
clang version 5.0.0 (trunk 299776)
...
<span class="quote">>>>clang -o simple.exe -O2 test.c</span >
<span class="quote">>>>clang -o test.exe -O2 -march=skylake-avx512 test.c</span >
<span class="quote">>>>./simple.exe</span >
2458652000
<span class="quote">>>>./test.exe</span >
2194410848
======================================
Disas of test.exe(299775):
...
400920: 62 f2 7d 48 33 a0 c0 vpmovzxwd 0x6120c0(%rax),%zmm4
400927: 20 61 00
40092a: 62 f2 7d 48 33 a8 e0 vpmovzxwd 0x6120e0(%rax),%zmm5
400931: 20 61 00
400934: 62 f2 7d 48 33 b0 00 vpmovzxwd 0x612100(%rax),%zmm6
40093b: 21 61 00
40093e: 62 f2 7d 48 33 b8 20 vpmovzxwd 0x612120(%rax),%zmm7
400945: 21 61 00
400948: 62 f2 5d 48 40 e4 vpmulld %zmm4,%zmm4,%zmm4
40094e: 62 f2 55 48 40 ed vpmulld %zmm5,%zmm5,%zmm5
400954: 62 f2 4d 48 40 f6 vpmulld %zmm6,%zmm6,%zmm6
40095a: 62 f2 45 48 40 ff vpmulld %zmm7,%zmm7,%zmm7
400960: 62 f1 5d 48 fe c0 vpaddd %zmm0,%zmm4,%zmm0
400966: 62 f1 55 48 fe c9 vpaddd %zmm1,%zmm5,%zmm1
40096c: 62 f1 4d 48 fe d2 vpaddd %zmm2,%zmm6,%zmm2
400972: 62 f1 45 48 fe db vpaddd %zmm3,%zmm7,%zmm3
...
Disas of test.exe(299776):
...
400950: c5 fe 6f a8 c0 20 61 vmovdqu 0x6120c0(%rax),%ymm5
400957: 00
400958: c5 fe 6f b0 e0 20 61 vmovdqu 0x6120e0(%rax),%ymm6
40095f: 00
400960: c5 fe 6f b8 00 21 61 vmovdqu 0x612100(%rax),%ymm7
400967: 00
400968: c5 7e 6f 80 20 21 61 vmovdqu 0x612120(%rax),%ymm8
40096f: 00
400970: c5 d5 f5 ed vpmaddwd %ymm5,%ymm5,%ymm5
400974: 62 f3 55 48 3a e8 01 vinserti32x8 $0x1,%ymm0,%zmm5,%zmm5
40097b: 62 f1 55 48 fe c9 vpaddd %zmm1,%zmm5,%zmm1
...
But vpmaddwd multiplies its operands as signed 16-bit integers, so we get a
wrong answer when the arguments are greater than SHRT_MAX.
===============r299776================
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp (revision 299775)
+++ lib/Target/X86/X86ISelLowering.cpp (revision 299776)
@@ -34618,6 +34618,51 @@
DAG.getConstant(0, DL, VT), NewCmp);
}
+static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue MulOp = N->getOperand(0);
+ SDValue Phi = N->getOperand(1);
+
+ if (MulOp.getOpcode() != ISD::MUL)
+ std::swap(MulOp, Phi);
+ if (MulOp.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ unsigned RegSize = 128;
+ if (Subtarget.hasBWI())
+ RegSize = 512;
+ else if (Subtarget.hasAVX2())
+ RegSize = 256;
+ unsigned VectorSize = VT.getVectorNumElements() * 16;
+ // If the vector size is less than 128, or greater than the supported
RegSize,
+ // do not use PMADD.
+ if (VectorSize < 128 || VectorSize > RegSize)
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements());
+ EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ VT.getVectorNumElements() / 2);
+
+ // Shrink the operands of mul.
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT,
MulOp->getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT,
MulOp->getOperand(1));
+
+ // Madd vector size is half of the original vector size
+ SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
+ // Fill the rest of the output with 0
+ SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
+ return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
+}
+
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
@@ -34695,6 +34740,8 @@
if (Flags->hasVectorReduction()) {
if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
return Sad;
+ if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
+ return MAdd;
}
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
======================================================
This optimization can't be used with MULU16, so a possible solution may be to add:
--- lib/Target/X86/X86ISelLowering.cpp (revision 300686)
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
@@ -34631,7 +34631,7 @@
return SDValue();
ShrinkMode Mode;
- if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))
+ if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
return SDValue();
------------------------
Intel Software Engineer
Ilia Taraban</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>