[llvm-bugs] [Bug 48160] New: a bug report happenning in clang in loop vectorize(enable)
via llvm-bugs
llvm-bugs at lists.llvm.org
Thu Nov 12 03:46:12 PST 2020
https://bugs.llvm.org/show_bug.cgi?id=48160
Bug ID: 48160
Summary: a bug report happenning in clang in loop
vectorize(enable)
Product: clang
Version: 10.0
Hardware: PC
OS: MacOS X
Status: NEW
Severity: enhancement
Priority: P
Component: C++2a
Assignee: unassignedclangbugs at nondot.org
Reporter: ustcw0ng at mail.ustc.edu.cn
CC: blitzrakete at gmail.com, erik.pilkington at gmail.com,
llvm-bugs at lists.llvm.org, richard-llvm at metafoo.co.uk
Description:
Two versions of the code both request loop vectorization with
#pragma clang loop vectorize(enable), but only the first one is actually
vectorized; the second is not.
Both are compiled in the same environment with the same CXX_FLAGS.
version:
clang version:Apple clang version 12.0.0 (clang-1200.0.32.27)
Target: x86_64-apple-darwin19.6.0
Thread model: posix
compiler flag:
-O2 -DNDEBUG -g -Xclang -fopenmp -std=gnu++2a -march=native
-Rpass-missed=loop-vectorized(important to show the bug)
source code:
(Two versions are compared: the first produces the expected vectorized
assembly, the second does not.)
(The main function is at the end)
first version:
(T = float, matrix<T> = vector<vector<T>>)
// Matrix-vector product (first version in the report): for each row i of
// `left`, accumulates the dot product with `right` in the local scalar `s`
// and stores it to out[i] once the inner loop has finished.
// NOTE(review): the inner loop reads left[i][j] for every j < right.size(),
// so it assumes each row has at least right.size() elements; nothing here
// checks that.
template<typename T>
vector<T> operator*(const matrix<T> &left, const vector<T> &right) {
vector<T> out(left.size());
for (size_t i = 0; i < left.size(); i++) {
// Per-row accumulator kept in a local rather than in out[i].
T s(0);
// Request vectorization of the inner (j) loop that follows.
#pragma clang loop vectorize(enable)
for (size_t j = 0; j < right.size(); j++)
s += left[i][j] * right[j];
// Single store per row, after the reduction is complete.
out[i] = s;
}
return out;
}
second version:(T and matrix<T> are the same)
// Matrix-vector product (second version in the report): same computation as
// the first version, but each product is added directly into out[i] inside
// the inner loop instead of going through a local accumulator.
// NOTE(review): relies on the elements of `out` starting at zero (presumably
// std::vector value-initialization; confirm what `vector` aliases).
// NOTE(review): the inner loop reads left[i][j] for every j < right.size(),
// so it assumes each row has at least right.size() elements; nothing here
// checks that.
template<typename T>
vector<T> operator*(const matrix<T> &left, const vector<T> &right) {
vector<T> out(left.size());
for (size_t i = 0; i < left.size(); i++)
// Request vectorization of the inner (j) loop that follows. The report's
// compiler warning says this request is not honored for this loop.
#pragma clang loop vectorize(enable)
for (size_t j = 0; j < right.size(); j++)
// Read-modify-write of out[i] on every iteration.
out[i] += left[i][j] * right[j];
return out;
}
error:
second version compiler warning:
warning: loop not vectorized: the optimizer was unable to perform the requested
transformation; the transformation might be disabled or specified as part of an
unsupported transformation ordering [-Wpass-failed=transform-warning]
for (size_t j = 0; j < right.size(); j++)
^
Now look at the assembly: the first version's assembly is written with SIMD
instructions, while the second version's is not!
test`operator+<float>:
0x10eaa6010 <+0>: pushq %rbp
0x10eaa6011 <+1>: movq %rsp, %rbp
0x10eaa6014 <+4>: pushq %r15
0x10eaa6016 <+6>: pushq %r14
0x10eaa6018 <+8>: pushq %r13
0x10eaa601a <+10>: pushq %r12
0x10eaa601c <+12>: pushq %rbx
0x10eaa601d <+13>: subq $0x18, %rsp
0x10eaa6021 <+17>: movq (%rsi), %rcx
0x10eaa6024 <+20>: movq 0x8(%rsi), %rax
0x10eaa6028 <+24>: vxorps %xmm0, %xmm0, %xmm0
0x10eaa602c <+28>: vmovups %xmm0, (%rdi)
0x10eaa6030 <+32>: movq %rdi, -0x30(%rbp)
0x10eaa6034 <+36>: movq $0x0, 0x10(%rdi)
0x10eaa603c <+44>: movq %rcx, -0x40(%rbp)
0x10eaa6040 <+48>: subq %rcx, %rax
0x10eaa6043 <+51>: je 0x10eaa628b ; <+635> at test.cpp
0x10eaa6049 <+57>: sarq $0x3, %rax
0x10eaa604d <+61>: movabsq $-0x5555555555555555, %r14 ; imm =
0xAAAAAAAAAAAAAAAB
0x10eaa6057 <+71>: imulq %rax, %r14
0x10eaa605b <+75>: movq %r14, %rax
0x10eaa605e <+78>: shrq $0x3e, %rax
0x10eaa6062 <+82>: jne 0x10eaa62a1 ; <+657> at test.cpp
0x10eaa6068 <+88>: movq %rdx, -0x38(%rbp)
0x10eaa606c <+92>: leaq (,%r14,4), %r12
0x10eaa6074 <+100>: movq %r12, %rdi
0x10eaa6077 <+103>: callq 0x10eaa6b14 ; symbol stub for:
operator new(unsigned long)
0x10eaa607c <+108>: movq %rax, %r13
0x10eaa607f <+111>: movq -0x30(%rbp), %r15
0x10eaa6083 <+115>: movq %rax, (%r15)
0x10eaa6086 <+118>: leaq (%rax,%r14,4), %rbx
0x10eaa608a <+122>: movq %rbx, 0x10(%r15)
0x10eaa608e <+126>: movq %rax, %rdi
0x10eaa6091 <+129>: movq %r12, %rsi
0x10eaa6094 <+132>: callq 0x10eaa6b1a ; symbol stub for:
__bzero
0x10eaa6099 <+137>: movq %rbx, 0x8(%r15)
0x10eaa609d <+141>: movq -0x38(%rbp), %rcx
0x10eaa60a1 <+145>: movq (%rcx), %rax
0x10eaa60a4 <+148>: movq 0x8(%rcx), %r12
0x10eaa60a8 <+152>: movq %r12, %rdx
0x10eaa60ab <+155>: subq %rax, %rdx
0x10eaa60ae <+158>: sarq $0x2, %rdx
0x10eaa60b2 <+162>: cmpq $0x1, %rdx
0x10eaa60b6 <+166>: movl $0x1, %r15d
0x10eaa60bc <+172>: cmovaq %rdx, %r15
0x10eaa60c0 <+176>: leaq -0x20(%r15), %r9
0x10eaa60c4 <+180>: shrq $0x5, %r9
0x10eaa60c8 <+184>: leaq 0x1(%r9), %r8
0x10eaa60cc <+188>: movq %r15, %r11
0x10eaa60cf <+191>: andq $-0x20, %r11
0x10eaa60d3 <+195>: movl %r8d, %r10d
0x10eaa60d6 <+198>: andl $0x1, %r10d
0x10eaa60da <+202>: andq $-0x2, %r8
0x10eaa60de <+206>: xorl %edi, %edi
0x10eaa60e0 <+208>: jmp 0x10eaa6103 ; <+243> at test.cpp
0x10eaa60e2 <+210>: nopw %cs:(%rax,%rax)
0x10eaa60ec <+220>: nopl (%rax)
0x10eaa60f0 <+224>: vmovss %xmm0, (%r13,%rdi,4)
0x10eaa60f7 <+231>: incq %rdi
0x10eaa60fa <+234>: cmpq %r14, %rdi
0x10eaa60fd <+237>: jae 0x10eaa628b ; <+635> at test.cpp
0x10eaa6103 <+243>: vxorps %xmm0, %xmm0, %xmm0
0x10eaa6107 <+247>: cmpq %rax, %r12
0x10eaa610a <+250>: je 0x10eaa60f0 ; <+224> at
test.cpp:19:16
0x10eaa610c <+252>: leaq (%rdi,%rdi,2), %rcx
0x10eaa6110 <+256>: movq -0x40(%rbp), %rsi
0x10eaa6114 <+260>: movq (%rsi,%rcx,8), %rsi
0x10eaa6118 <+264>: cmpq $0x1f, %r15
0x10eaa611c <+268>: ja 0x10eaa6130 ; <+288> at test.cpp
0x10eaa611e <+270>: vxorps %xmm0, %xmm0, %xmm0
0x10eaa6122 <+274>: xorl %ecx, %ecx
0x10eaa6124 <+276>: jmp 0x10eaa6270 ; <+608> at
test.cpp:18:18
0x10eaa6129 <+281>: nopl (%rax)
0x10eaa6130 <+288>: vxorps %xmm0, %xmm0, %xmm0
0x10eaa6134 <+292>: xorl %ebx, %ebx
0x10eaa6136 <+294>: vxorps %xmm1, %xmm1, %xmm1
0x10eaa613a <+298>: vxorps %xmm2, %xmm2, %xmm2
0x10eaa613e <+302>: vxorps %xmm3, %xmm3, %xmm3
0x10eaa6142 <+306>: testq %r9, %r9
0x10eaa6145 <+309>: je 0x10eaa61f4 ; <+484> at test.cpp
0x10eaa614b <+315>: movq %r8, %rcx
0x10eaa614e <+318>: nop
-> 0x10eaa6150 <+320>: vmovups (%rsi,%rbx,4), %ymm4
0x10eaa6155 <+325>: vmovups 0x20(%rsi,%rbx,4), %ymm5
0x10eaa615b <+331>: vmovups 0x40(%rsi,%rbx,4), %ymm6
0x10eaa6161 <+337>: vmovups 0x60(%rsi,%rbx,4), %ymm7
0x10eaa6167 <+343>: vmulps (%rax,%rbx,4), %ymm4, %ymm4
0x10eaa616c <+348>: vaddps %ymm4, %ymm0, %ymm0
0x10eaa6170 <+352>: vmulps 0x20(%rax,%rbx,4), %ymm5, %ymm4
0x10eaa6176 <+358>: vaddps %ymm4, %ymm1, %ymm1
0x10eaa617a <+362>: vmulps 0x40(%rax,%rbx,4), %ymm6, %ymm4
0x10eaa6180 <+368>: vaddps %ymm4, %ymm2, %ymm2
0x10eaa6184 <+372>: vmulps 0x60(%rax,%rbx,4), %ymm7, %ymm4
0x10eaa618a <+378>: vaddps %ymm4, %ymm3, %ymm3
0x10eaa618e <+382>: vmovups 0x80(%rsi,%rbx,4), %ymm4
0x10eaa6197 <+391>: vmovups 0xa0(%rsi,%rbx,4), %ymm5
0x10eaa61a0 <+400>: vmovups 0xc0(%rsi,%rbx,4), %ymm6
0x10eaa61a9 <+409>: vmovups 0xe0(%rsi,%rbx,4), %ymm7
0x10eaa61b2 <+418>: vmulps 0x80(%rax,%rbx,4), %ymm4, %ymm4
0x10eaa61bb <+427>: vaddps %ymm4, %ymm0, %ymm0
0x10eaa61bf <+431>: vmulps 0xa0(%rax,%rbx,4), %ymm5, %ymm4
0x10eaa61c8 <+440>: vaddps %ymm4, %ymm1, %ymm1
0x10eaa61cc <+444>: vmulps 0xc0(%rax,%rbx,4), %ymm6, %ymm4
0x10eaa61d5 <+453>: vaddps %ymm4, %ymm2, %ymm2
0x10eaa61d9 <+457>: vmulps 0xe0(%rax,%rbx,4), %ymm7, %ymm4
0x10eaa61e2 <+466>: vaddps %ymm4, %ymm3, %ymm3
0x10eaa61e6 <+470>: addq $0x40, %rbx
0x10eaa61ea <+474>: addq $-0x2, %rcx
0x10eaa61ee <+478>: jne 0x10eaa6150 ; <+320> at
test.cpp:18:18
0x10eaa61f4 <+484>: testq %r10, %r10
0x10eaa61f7 <+487>: je 0x10eaa6237 ; <+551> at
test.cpp:17:9
0x10eaa61f9 <+489>: vmovups (%rsi,%rbx,4), %ymm4
0x10eaa61fe <+494>: vmovups 0x20(%rsi,%rbx,4), %ymm5
0x10eaa6204 <+500>: vmovups 0x40(%rsi,%rbx,4), %ymm6
0x10eaa620a <+506>: vmovups 0x60(%rsi,%rbx,4), %ymm7
0x10eaa6210 <+512>: vmulps 0x60(%rax,%rbx,4), %ymm7, %ymm7
0x10eaa6216 <+518>: vaddps %ymm7, %ymm3, %ymm3
0x10eaa621a <+522>: vmulps 0x40(%rax,%rbx,4), %ymm6, %ymm6
0x10eaa6220 <+528>: vaddps %ymm6, %ymm2, %ymm2
0x10eaa6224 <+532>: vmulps 0x20(%rax,%rbx,4), %ymm5, %ymm5
0x10eaa622a <+538>: vaddps %ymm5, %ymm1, %ymm1
0x10eaa622e <+542>: vmulps (%rax,%rbx,4), %ymm4, %ymm4
0x10eaa6233 <+547>: vaddps %ymm4, %ymm0, %ymm0
0x10eaa6237 <+551>: vaddps %ymm0, %ymm1, %ymm0
0x10eaa623b <+555>: vaddps %ymm0, %ymm2, %ymm0
0x10eaa623f <+559>: vaddps %ymm0, %ymm3, %ymm0
0x10eaa6243 <+563>: vextractf128 $0x1, %ymm0, %xmm1
0x10eaa6249 <+569>: vaddps %xmm1, %xmm0, %xmm0
0x10eaa624d <+573>: vpermilpd $0x1, %xmm0, %xmm1 ; xmm1 = xmm0[1,0]
0x10eaa6253 <+579>: vaddps %xmm1, %xmm0, %xmm0
0x10eaa6257 <+583>: vmovshdup %xmm0, %xmm1 ; xmm1 =
xmm0[1,1,3,3]
0x10eaa625b <+587>: vaddss %xmm1, %xmm0, %xmm0
0x10eaa625f <+591>: movq %r11, %rcx
0x10eaa6262 <+594>: cmpq %r11, %r15
0x10eaa6265 <+597>: je 0x10eaa60f0 ; <+224> at
test.cpp:19:16
0x10eaa626b <+603>: nopl (%rax,%rax)
0x10eaa6270 <+608>: vmovss (%rsi,%rcx,4), %xmm1 ; xmm1 =
mem[0],zero,zero,zero
0x10eaa6275 <+613>: vmulss (%rax,%rcx,4), %xmm1, %xmm1
0x10eaa627a <+618>: vaddss %xmm1, %xmm0, %xmm0
0x10eaa627e <+622>: incq %rcx
0x10eaa6281 <+625>: cmpq %rdx, %rcx
0x10eaa6284 <+628>: jb 0x10eaa6270 ; <+608> at
test.cpp:18:18
0x10eaa6286 <+630>: jmp 0x10eaa60f0 ; <+224> at
test.cpp:19:16
0x10eaa628b <+635>: movq -0x30(%rbp), %rax
0x10eaa628f <+639>: addq $0x18, %rsp
0x10eaa6293 <+643>: popq %rbx
0x10eaa6294 <+644>: popq %r12
0x10eaa6296 <+646>: popq %r13
0x10eaa6298 <+648>: popq %r14
0x10eaa629a <+650>: popq %r15
0x10eaa629c <+652>: popq %rbp
0x10eaa629d <+653>: vzeroupper
0x10eaa62a0 <+656>: retq
0x10eaa62a1 <+657>: movq -0x30(%rbp), %rdi
0x10eaa62a5 <+661>: callq 0x10eaa6aa8 ; symbol stub for:
std::__1::__vector_base_common<true>::__throw_length_error() const
0x10eaa62aa <+666>: ud2
0x10eaa62ac <+668>: movq %rax, %rbx
0x10eaa62af <+671>: movq -0x30(%rbp), %rax
0x10eaa62b3 <+675>: movq (%rax), %rdi
0x10eaa62b6 <+678>: testq %rdi, %rdi
0x10eaa62b9 <+681>: je 0x10eaa62c8 ; <+696> at new
0x10eaa62bb <+683>: movq -0x30(%rbp), %rax
0x10eaa62bf <+687>: movq %rdi, 0x8(%rax)
0x10eaa62c3 <+691>: callq 0x10eaa6b0e ; symbol stub for:
operator delete(void*)
0x10eaa62c8 <+696>: movq %rbx, %rdi
0x10eaa62cb <+699>: callq 0x10eaa6aa2 ; symbol stub for:
_Unwind_Resume
0x10eaa62d0 <+704>: ud2
0x10eaa62d2 <+706>: nopw %cs:(%rax,%rax)
0x10eaa62dc <+716>: nopl (%rax)
test`operator+<float>:
0x10de2f1c0 <+0>: pushq %rbp
0x10de2f1c1 <+1>: movq %rsp, %rbp
0x10de2f1c4 <+4>: pushq %r15
0x10de2f1c6 <+6>: pushq %r14
0x10de2f1c8 <+8>: pushq %r13
0x10de2f1ca <+10>: pushq %r12
0x10de2f1cc <+12>: pushq %rbx
0x10de2f1cd <+13>: subq $0x18, %rsp
0x10de2f1d1 <+17>: movq %rdi, %r14
0x10de2f1d4 <+20>: movq (%rsi), %r13
0x10de2f1d7 <+23>: movq 0x8(%rsi), %rax
0x10de2f1db <+27>: vxorps %xmm0, %xmm0, %xmm0
0x10de2f1df <+31>: vmovups %xmm0, (%rdi)
0x10de2f1e3 <+35>: movq $0x0, 0x10(%rdi)
0x10de2f1eb <+43>: subq %r13, %rax
0x10de2f1ee <+46>: je 0x10de2f29d ; <+221> at
test.cpp:20:1
0x10de2f1f4 <+52>: sarq $0x3, %rax
0x10de2f1f8 <+56>: movabsq $-0x5555555555555555, %r15 ; imm =
0xAAAAAAAAAAAAAAAB
0x10de2f202 <+66>: imulq %rax, %r15
0x10de2f206 <+70>: movq %r15, %rax
0x10de2f209 <+73>: shrq $0x3e, %rax
0x10de2f20d <+77>: jne 0x10de2f2af ; <+239> [inlined]
std::__1::vector<float, std::__1::allocator<float> >::__vallocate(unsigned
long) at vector:1125
0x10de2f213 <+83>: movq %rdx, -0x38(%rbp)
0x10de2f217 <+87>: leaq (,%r15,4), %rdi
0x10de2f21f <+95>: movq %rdi, -0x30(%rbp)
0x10de2f223 <+99>: callq 0x10de2fb14 ; symbol stub for:
operator new(unsigned long)
0x10de2f228 <+104>: movq %rax, %rbx
0x10de2f22b <+107>: movq %rax, (%r14)
0x10de2f22e <+110>: leaq (%rax,%r15,4), %r12
0x10de2f232 <+114>: movq %r12, 0x10(%r14)
0x10de2f236 <+118>: movq %rax, %rdi
0x10de2f239 <+121>: movq -0x30(%rbp), %rsi
0x10de2f23d <+125>: callq 0x10de2fb1a ; symbol stub for:
__bzero
0x10de2f242 <+130>: movq %r12, 0x8(%r14)
0x10de2f246 <+134>: movq -0x38(%rbp), %rcx
0x10de2f24a <+138>: movq (%rcx), %rax
0x10de2f24d <+141>: movq 0x8(%rcx), %r8
0x10de2f251 <+145>: movq %r8, %rdx
0x10de2f254 <+148>: subq %rax, %rdx
0x10de2f257 <+151>: sarq $0x2, %rdx
0x10de2f25b <+155>: xorl %esi, %esi
0x10de2f25d <+157>: jmp 0x10de2f268 ; <+168> at test.cpp
0x10de2f25f <+159>: nop
0x10de2f260 <+160>: incq %rsi
0x10de2f263 <+163>: cmpq %r15, %rsi
0x10de2f266 <+166>: jae 0x10de2f29d ; <+221> at
test.cpp:20:1
0x10de2f268 <+168>: cmpq %rax, %r8
0x10de2f26b <+171>: je 0x10de2f260 ; <+160> at
test.cpp:14:42
0x10de2f26d <+173>: leaq (%rsi,%rsi,2), %rcx
0x10de2f271 <+177>: movq (%r13,%rcx,8), %rdi
-> 0x10de2f276 <+182>: vmovss (%rbx,%rsi,4), %xmm0 ; xmm0 =
mem[0],zero,zero,zero
0x10de2f27b <+187>: xorl %ecx, %ecx
0x10de2f27d <+189>: nopl (%rax)
0x10de2f280 <+192>: vmovss (%rdi,%rcx,4), %xmm1 ; xmm1 =
mem[0],zero,zero,zero
0x10de2f285 <+197>: vmulss (%rax,%rcx,4), %xmm1, %xmm1
0x10de2f28a <+202>: vaddss %xmm0, %xmm1, %xmm0
0x10de2f28e <+206>: vmovss %xmm0, (%rbx,%rsi,4)
0x10de2f293 <+211>: incq %rcx
0x10de2f296 <+214>: cmpq %rdx, %rcx
0x10de2f299 <+217>: jb 0x10de2f280 ; <+192> at
test.cpp:17:23
0x10de2f29b <+219>: jmp 0x10de2f260 ; <+160> at
test.cpp:14:42
0x10de2f29d <+221>: movq %r14, %rax
0x10de2f2a0 <+224>: addq $0x18, %rsp
0x10de2f2a4 <+228>: popq %rbx
0x10de2f2a5 <+229>: popq %r12
0x10de2f2a7 <+231>: popq %r13
0x10de2f2a9 <+233>: popq %r14
0x10de2f2ab <+235>: popq %r15
0x10de2f2ad <+237>: popq %rbp
0x10de2f2ae <+238>: retq
0x10de2f2af <+239>: movq %r14, %rdi
0x10de2f2b2 <+242>: callq 0x10de2faa8 ; symbol stub for:
std::__1::__vector_base_common<true>::__throw_length_error() const
0x10de2f2b7 <+247>: ud2
0x10de2f2b9 <+249>: movq %rax, %rbx
0x10de2f2bc <+252>: movq (%r14), %rdi
0x10de2f2bf <+255>: testq %rdi, %rdi
0x10de2f2c2 <+258>: je 0x10de2f2cd ; <+269> at new
0x10de2f2c4 <+260>: movq %rdi, 0x8(%r14)
0x10de2f2c8 <+264>: callq 0x10de2fb0e ; symbol stub for:
operator delete(void*)
0x10de2f2cd <+269>: movq %rbx, %rdi
0x10de2f2d0 <+272>: callq 0x10de2faa2 ; symbol stub for:
_Unwind_Resume
0x10de2f2d5 <+277>: ud2
0x10de2f2d7 <+279>: nopw (%rax,%rax)
main:
// Repro driver: builds a size x size float matrix and a size-element vector,
// multiplies them with operator*, and prints the first element of the result.
// NOTE(review): `solved` is not declared anywhere in this snippet; it
// presumably needs a declaration such as vector<T> before the assignment.
int main(){
using T = float;
constexpr size_t size = 1000;
matrix<T> to_decompose(size, vector<T>(size));
vector<T> to_solve(size);
// NOTE(review): the block comment below also contains the loop that fills
// to_decompose, so as written neither input is explicitly populated.
/*use random to generate to_solve, uncomment this if you want to
std::random_device rd; //Will be used to obtain a seed for the random
number engine
std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd()
std::uniform_real_distribution<> dis(-1.0, 1.0);
for (size_t i = 0; i < size; ++i){
if (i != 0)
to_decompose[i][i - 1] = to_decompose[i - 1][i]= 1;
to_decompose[i][i] = 10;
}
for (size_t i = 0; i < size; ++i) {
// Use dis to transform the random unsigned int generated by gen into a
// T in [-1, 1). Each call to dis(gen) generates a new random T
to_solve[i] = dis(gen);
}
*/
// Invokes the operator* under test.
solved = to_decompose * to_solve;
cout << solved[0] << endl;
}
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20201112/0a795476/attachment-0001.html>
More information about the llvm-bugs
mailing list