[llvm-bugs] [Bug 35170] New: Auto-vectorization is too aggressive/naive about scatter/gather instructions

via llvm-bugs llvm-bugs at lists.llvm.org
Thu Nov 2 09:22:43 PDT 2017


https://bugs.llvm.org/show_bug.cgi?id=35170

            Bug ID: 35170
           Summary: Auto-vectorization is too aggressive/naive about
                    scatter/gather instructions
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: dave at znu.io
                CC: llvm-bugs at lists.llvm.org

Note: LLVM r317088

clang+llvm ToT built itself and -march=knl is way too aggressive/naive about
using vector scatter/gather instructions during auto-vectorization. For
example, consider llvm::DenseMap<...>::grow(unsigned int) [with code gen
below).

This generates a large loop that moves a ton of scalars and pointers from the
scalar domain into the vector domain, packs said scalars, and then uses
vpscatterqq to store the scalars as scalars. If the vector scatter/gather
instructions were magically more efficient than the scalar equivalents, then
maybe all of the scalar-to-vector copying and packing would be worth it -- but
they're not. The scatter/gather instructions exist to *avoid* movement of data
between the scalar and vector domains (like what the compiler generated):

 1068530:       48 8d 8a 10 fe ff ff    lea    -0x1f0(%rdx),%rcx
 1068537:       48 8d 9a 20 fe ff ff    lea    -0x1e0(%rdx),%rbx
 106853e:       48 8d aa 30 fe ff ff    lea    -0x1d0(%rdx),%rbp
 1068545:       48 8d ba 40 fe ff ff    lea    -0x1c0(%rdx),%rdi
 106854c:       4c 8d 82 50 fe ff ff    lea    -0x1b0(%rdx),%r8
 1068553:       4c 8d b2 60 fe ff ff    lea    -0x1a0(%rdx),%r14
 106855a:       4c 8d 8a 70 fe ff ff    lea    -0x190(%rdx),%r9
 1068561:       4c 8d 92 80 fe ff ff    lea    -0x180(%rdx),%r10
 1068568:       c4 c1 f9 6e ca          vmovq  %r10,%xmm1
 106856d:       c4 c1 f9 6e d1          vmovq  %r9,%xmm2
 1068572:       c5 e9 6c c9             vpunpcklqdq %xmm1,%xmm2,%xmm1
 1068576:       c4 c1 f9 6e d6          vmovq  %r14,%xmm2
 106857b:       c4 c1 f9 6e d8          vmovq  %r8,%xmm3
 1068580:       c5 e1 6c d2             vpunpcklqdq %xmm2,%xmm3,%xmm2
 1068584:       c4 e3 6d 38 c9 01       vinserti128 $0x1,%xmm1,%ymm2,%ymm1
 106858a:       c4 e1 f9 6e d7          vmovq  %rdi,%xmm2
 106858f:       c4 e1 f9 6e dd          vmovq  %rbp,%xmm3
 1068594:       c5 e1 6c d2             vpunpcklqdq %xmm2,%xmm3,%xmm2
 1068598:       c4 e1 f9 6e db          vmovq  %rbx,%xmm3
 106859d:       c4 e1 f9 6e e1          vmovq  %rcx,%xmm4
 10685a2:       c5 d9 6c db             vpunpcklqdq %xmm3,%xmm4,%xmm3
 10685a6:       c4 e3 65 38 d2 01       vinserti128 $0x1,%xmm2,%ymm3,%ymm2
 10685ac:       62 f3 ed 48 3a c9 01    vinserti64x4 $0x1,%ymm1,%zmm2,%zmm1
 10685b3:       4c 8d 82 90 fe ff ff    lea    -0x170(%rdx),%r8
 10685ba:       4c 8d 8a a0 fe ff ff    lea    -0x160(%rdx),%r9
 10685c1:       4c 8d 92 b0 fe ff ff    lea    -0x150(%rdx),%r10
 10685c8:       4c 8d b2 c0 fe ff ff    lea    -0x140(%rdx),%r14
 10685cf:       48 8d 8a d0 fe ff ff    lea    -0x130(%rdx),%rcx
 10685d6:       48 8d ba e0 fe ff ff    lea    -0x120(%rdx),%rdi
 10685dd:       48 8d aa f0 fe ff ff    lea    -0x110(%rdx),%rbp
 10685e4:       48 8d 9a 00 ff ff ff    lea    -0x100(%rdx),%rbx
 10685eb:       c4 e1 f9 6e d3          vmovq  %rbx,%xmm2
 10685f0:       c4 e1 f9 6e dd          vmovq  %rbp,%xmm3
 10685f5:       c5 e1 6c d2             vpunpcklqdq %xmm2,%xmm3,%xmm2
 10685f9:       c4 e1 f9 6e df          vmovq  %rdi,%xmm3
 10685fe:       c4 e1 f9 6e e1          vmovq  %rcx,%xmm4
 1068603:       c5 d9 6c db             vpunpcklqdq %xmm3,%xmm4,%xmm3
 1068607:       c4 e3 65 38 d2 01       vinserti128 $0x1,%xmm2,%ymm3,%ymm2
 106860d:       c4 c1 f9 6e de          vmovq  %r14,%xmm3
 1068612:       c4 c1 f9 6e e2          vmovq  %r10,%xmm4
 1068617:       c5 d9 6c db             vpunpcklqdq %xmm3,%xmm4,%xmm3
 106861b:       c4 c1 f9 6e e1          vmovq  %r9,%xmm4
 1068620:       c4 c1 f9 6e e8          vmovq  %r8,%xmm5
 1068625:       c5 d1 6c e4             vpunpcklqdq %xmm4,%xmm5,%xmm4
 1068629:       c4 e3 5d 38 db 01       vinserti128 $0x1,%xmm3,%ymm4,%ymm3
 106862f:       62 f3 e5 48 3a d2 01    vinserti64x4 $0x1,%ymm2,%zmm3,%zmm2
 1068636:       4c 8d 82 10 ff ff ff    lea    -0xf0(%rdx),%r8
 106863d:       4c 8d 8a 20 ff ff ff    lea    -0xe0(%rdx),%r9
 1068644:       4c 8d 92 30 ff ff ff    lea    -0xd0(%rdx),%r10
 106864b:       4c 8d b2 40 ff ff ff    lea    -0xc0(%rdx),%r14
 1068652:       48 8d 8a 50 ff ff ff    lea    -0xb0(%rdx),%rcx
 1068659:       48 8d ba 60 ff ff ff    lea    -0xa0(%rdx),%rdi
 1068660:       48 8d aa 70 ff ff ff    lea    -0x90(%rdx),%rbp
 1068667:       48 8d 5a 80             lea    -0x80(%rdx),%rbx
 106866b:       c4 e1 f9 6e db          vmovq  %rbx,%xmm3
 1068670:       c4 e1 f9 6e e5          vmovq  %rbp,%xmm4
 1068675:       c5 d9 6c db             vpunpcklqdq %xmm3,%xmm4,%xmm3
 1068679:       c4 e1 f9 6e e7          vmovq  %rdi,%xmm4
 106867e:       c4 e1 f9 6e e9          vmovq  %rcx,%xmm5
 1068683:       c5 d1 6c e4             vpunpcklqdq %xmm4,%xmm5,%xmm4
 1068687:       c4 e3 5d 38 db 01       vinserti128 $0x1,%xmm3,%ymm4,%ymm3
 106868d:       c4 c1 f9 6e e6          vmovq  %r14,%xmm4
 1068692:       c4 c1 f9 6e ea          vmovq  %r10,%xmm5
 1068697:       c5 d1 6c e4             vpunpcklqdq %xmm4,%xmm5,%xmm4
 106869b:       c4 c1 f9 6e e9          vmovq  %r9,%xmm5
 10686a0:       c4 c1 f9 6e f0          vmovq  %r8,%xmm6
 10686a5:       c5 c9 6c ed             vpunpcklqdq %xmm5,%xmm6,%xmm5
 10686a9:       c4 e3 55 38 e4 01       vinserti128 $0x1,%xmm4,%ymm5,%ymm4
 10686af:       62 f3 dd 48 3a db 01    vinserti64x4 $0x1,%ymm3,%zmm4,%zmm3
 10686b6:       4c 8d 42 90             lea    -0x70(%rdx),%r8
 10686ba:       4c 8d 4a a0             lea    -0x60(%rdx),%r9
 10686be:       4c 8d 52 b0             lea    -0x50(%rdx),%r10
 10686c2:       48 8d 5a c0             lea    -0x40(%rdx),%rbx
 10686c6:       48 8d 4a d0             lea    -0x30(%rdx),%rcx
 10686ca:       48 8d 7a e0             lea    -0x20(%rdx),%rdi
 10686ce:       48 8d 6a f0             lea    -0x10(%rdx),%rbp
 10686d2:       c4 e1 f9 6e e2          vmovq  %rdx,%xmm4
 10686d7:       c4 e1 f9 6e ed          vmovq  %rbp,%xmm5
 10686dc:       c5 d1 6c e4             vpunpcklqdq %xmm4,%xmm5,%xmm4
 10686e0:       c4 e1 f9 6e ef          vmovq  %rdi,%xmm5
 10686e5:       c4 e1 f9 6e f1          vmovq  %rcx,%xmm6
 10686ea:       c5 c9 6c ed             vpunpcklqdq %xmm5,%xmm6,%xmm5
 10686ee:       c4 e3 55 38 e4 01       vinserti128 $0x1,%xmm4,%ymm5,%ymm4
 10686f4:       c4 e1 f9 6e eb          vmovq  %rbx,%xmm5
 10686f9:       c4 c1 f9 6e f2          vmovq  %r10,%xmm6
 10686fe:       c5 c9 6c ed             vpunpcklqdq %xmm5,%xmm6,%xmm5
 1068702:       c4 c1 f9 6e f1          vmovq  %r9,%xmm6
 1068707:       c4 c1 f9 6e f8          vmovq  %r8,%xmm7
 106870c:       c5 c1 6c f6             vpunpcklqdq %xmm6,%xmm7,%xmm6
 1068710:       c4 e3 4d 38 ed 01       vinserti128 $0x1,%xmm5,%ymm6,%ymm5
 1068716:       62 f3 d5 48 3a e4 01    vinserti64x4 $0x1,%ymm4,%zmm5,%zmm4
 106871d:       c5 fc 46 c8             kxnorw %k0,%k0,%k1
 1068721:       62 f2 fd 49 a1 04 0d    vpscatterqq %zmm0,0x0(,%zmm1,1){%k1}
 1068728:       00 00 00 00 
 106872c:       c5 fc 46 c8             kxnorw %k0,%k0,%k1
 1068730:       62 f2 fd 49 a1 04 15    vpscatterqq %zmm0,0x0(,%zmm2,1){%k1}
 1068737:       00 00 00 00 
 106873b:       c5 fc 46 c8             kxnorw %k0,%k0,%k1
 106873f:       62 f2 fd 49 a1 04 1d    vpscatterqq %zmm0,0x0(,%zmm3,1){%k1}
 1068746:       00 00 00 00 
 106874a:       c5 fc 46 c8             kxnorw %k0,%k0,%k1
 106874e:       62 f2 fd 49 a1 04 25    vpscatterqq %zmm0,0x0(,%zmm4,1){%k1}
 1068755:       00 00 00 00 
 1068759:       48 81 c2 00 02 00 00    add    $0x200,%rdx
 1068760:       49 83 c5 e0             add    $0xffffffffffffffe0,%r13
 1068764:       0f 85 c6 fd ff ff       jne    1068530
<_ZN4llvm8DenseMapIPNS_8MCSymbolENS_14PointerIntPairIS2_Lj1EbNS_21PointerLikeTypeTraitsIS2_EENS_18PointerIntPairInfoIS2_Lj1ES5_EEEENS_12DenseMapInfoIS2_EENS_6detail12DenseMapPairIS2_S8_EEE4growEj+0x100>

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20171102/8b660b17/attachment.html>


More information about the llvm-bugs mailing list