[llvm-bugs] [Bug 48078] New: __m256i (a+a*2)+b (vpmullw; vpaddw) is slower than (b+a*2)+a (3 vpaddw)
via llvm-bugs
llvm-bugs at lists.llvm.org
Wed Nov 4 15:20:51 PST 2020
https://bugs.llvm.org/show_bug.cgi?id=48078
Bug ID: 48078
Summary: __m256i (a+a*2)+b (vpmullw;vpaddw) is slower than
(b+a*2)+a (3 vpaddw)
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: i at maskray.me
CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
llvm-dev at redking.me.uk, pengfei.wang at intel.com,
spatel+llvm at rotateright.com
Discovered by JP Maaninen
#include <immintrin.h>
__m256i Slow(__m256i a, __m256i b) {
__m256i c = _mm256_add_epi16(a, _mm256_slli_epi16(a, 1));
return _mm256_add_epi16(c, b);
}
vpmullw .LCPI0_0(%rip), %ymm0, %ymm0
vpaddw %ymm1, %ymm0, %ymm0
retq
__m256i Fast(__m256i a, __m256i b) {
__m256i c = _mm256_add_epi16(b, _mm256_slli_epi16(a, 1));
return _mm256_add_epi16(c, a);
}
vpaddw %ymm0, %ymm0, %ymm2
vpaddw %ymm0, %ymm1, %ymm0
vpaddw %ymm2, %ymm0, %ymm0
retq
----
This is either an instcombine problem or a missing optimization in the backend. The IR for Slow before instcombine is:
define dso_local <4 x i64> @_Z4SlowDv4_xS_(<4 x i64> %a, <4 x i64> %b)
local_unnamed_addr #0 {
entry:
%0 = bitcast <4 x i64> %a to <16 x i16>
%1 = shl <16 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%add.i5 = add <16 x i16> %1, %0
%2 = bitcast <16 x i16> %add.i5 to <4 x i64>
%3 = bitcast <4 x i64> %b to <16 x i16>
%add.i = add <16 x i16> %3, %add.i5
%4 = bitcast <16 x i16> %add.i to <4 x i64>
ret <4 x i64> %4
}
attributes #0 = { norecurse nounwind readnone uwtable
"disable-tail-calls"="false" "frame-pointer"="none"
"less-precise-fpmad"="false" "min-legal-vector-width"="256"
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="haswell"
"target-features"="+avx,+avx2,+bmi,+bmi2,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
"unsafe-fp-math"="false" "use-soft-float"="false" }
Running opt -passes=instcombine -S on this IR generates
%add.i5 = mul <16 x i16> %0, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16
3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
which is lowered to VPMULLWYrm in X86ISelDAGToDAG and then survives every
subsequent codegen pass.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20201104/a1f5611a/attachment-0001.html>
More information about the llvm-bugs
mailing list