[llvm-bugs] [Bug 49718] New: Loop gets vectorized into AVX2 but vectorized form is unreachable

Wed Mar 24 16:15:34 PDT 2021

https://bugs.llvm.org/show_bug.cgi?id=49718

            Bug ID: 49718
           Summary: Loop gets vectorized into AVX2 but vectorized form is
                    unreachable
           Product: new-bugs
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: new bugs
          Assignee: unassignedbugs at nondot.org
          Reporter: spencer at jacknife.org
                CC: htmldeveloper at gmail.com, llvm-bugs at lists.llvm.org

While trying to construct a simple loop that gets auto-vectorized into AVX2
instructions, I came across an odd case.  (Bear in mind I am not an expert in
loop vectorization.)

#include <cstdint>
#include <stdexcept>
#include <string>
#include <string_view>

namespace
{
  void decode(std::string& value,
              const std::string_view key)
  {
    if(__builtin_expect(key.size() != 0x20, false))
    {
      throw std::invalid_argument{"wrong size"};
    }

    std::string_view::size_type keyPos = 0;
    for(auto& ch : value)
    {
      auto& byte = reinterpret_cast<std::uint8_t&>(ch);

      byte ^= key[keyPos & 0x1f];
      ++keyPos;
    }
  }
}

int
main(const int argc, const char* argv[])
{
  std::string arg{argv[1]};
  decode(arg, "01234567012345670123456701234567");
  std::puts(arg.c_str());
}

With clang-13 (Debian 10 nightly
1:13~++20210321102631+2554b95db57c-1~exp1~20210321213413.1383), this produces
code like this:

dispatch:
   0x0000000000401229 <+169>:   lea    (%rax,%r10,1),%r11
   0x000000000040122d <+173>:   xor    %ecx,%ecx
   0x000000000040122f <+175>:   cmp    $0x8,%r10
   0x0000000000401233 <+179>:   jb     0x401256 <main(int, char const**)+214>
   0x0000000000401235 <+181>:   lea    -0x1(%r10),%rdx
=> 0x0000000000401239 <+185>:   cmp    $0x1f,%rdx
   0x000000000040123d <+189>:   ja     0x401256 <main(int, char const**)+214>
   0x000000000040123f <+191>:   lea    0x402040(%r10),%rdx
   0x0000000000401246 <+198>:   cmp    %rdx,%rax
   0x0000000000401249 <+201>:   jae    0x4012b2 <main(int, char const**)+306>
   0x000000000040124b <+203>:   cmp    $0x402040,%r11
   0x0000000000401252 <+210>:   jbe    0x4012b2 <main(int, char const**)+306>
unvectorized loop:
   0x0000000000401256 <+214>:   mov    %rax,%rdi
   0x0000000000401259 <+217>:   nopl   0x0(%rax)
   0x0000000000401260 <+224>:   mov    %ecx,%eax
   0x0000000000401262 <+226>:   and    $0x1f,%eax
   0x0000000000401265 <+229>:   movzbl 0x402040(%rax),%eax
   0x000000000040126c <+236>:   xor    %al,(%rdi)
   0x000000000040126e <+238>:   add    $0x1,%rcx
   0x0000000000401272 <+242>:   add    $0x1,%rdi
   0x0000000000401276 <+246>:   cmp    %rdi,%r11
   0x0000000000401279 <+249>:   jne    0x401260 <main(int, char const**)+224>

vectorized unrolled:
   0x00000000004012b2 <+306>:   cmp    $0x80,%r10
   0x00000000004012b9 <+313>:   jae    0x4012c2 <main(int, char const**)+322>
   0x00000000004012bb <+315>:   xor    %ecx,%ecx
   0x00000000004012bd <+317>:   jmpq   0x401394 <main(int, char const**)+532>
   0x00000000004012c2 <+322>:   mov    %r10,%rcx
   0x00000000004012c5 <+325>:   and    $0xffffffffffffff80,%rcx
   0x00000000004012c9 <+329>:   lea    -0x80(%rcx),%rdx
   0x00000000004012cd <+333>:   mov    %rdx,%rdi
   0x00000000004012d0 <+336>:   shr    $0x7,%rdi
   0x00000000004012d4 <+340>:   add    $0x1,%rdi
   0x00000000004012d8 <+344>:   mov    %edi,%r9d
   0x00000000004012db <+347>:   and    $0x3,%r9d
   0x00000000004012df <+351>:   cmp    $0x180,%rdx
   0x00000000004012e6 <+358>:   jae    0x4012ec <main(int, char const**)+364>
   0x00000000004012e8 <+360>:   xor    %ebx,%ebx
   0x00000000004012ea <+362>:   jmp    0x40134d <main(int, char const**)+461>
   0x00000000004012ec <+364>:   and    $0xfffffffffffffffc,%rdi
   0x00000000004012f0 <+368>:   neg    %rdi
   0x00000000004012f3 <+371>:   xor    %ebx,%ebx
   0x00000000004012f5 <+373>:   vbroadcastsd 0xd0a(%rip),%ymm0        #
0x402008
   0x00000000004012fe <+382>:   xchg   %ax,%ax
   0x0000000000401300 <+384>:   vxorps (%rax,%rbx,1),%ymm0,%ymm1
   0x0000000000401305 <+389>:   vmovups %ymm1,(%rax,%rbx,1)
   0x000000000040130a <+394>:   vxorps 0x80(%rax,%rbx,1),%ymm0,%ymm1
   0x0000000000401313 <+403>:   vmovups %ymm1,0x80(%rax,%rbx,1)
   0x000000000040131c <+412>:   vxorps 0x100(%rax,%rbx,1),%ymm0,%ymm1
   0x0000000000401325 <+421>:   vmovups %ymm1,0x100(%rax,%rbx,1)
   0x000000000040132e <+430>:   vxorps 0x180(%rax,%rbx,1),%ymm0,%ymm1
   0x0000000000401337 <+439>:   vmovups %ymm1,0x180(%rax,%rbx,1)
   0x0000000000401340 <+448>:   add    $0x200,%rbx
   0x0000000000401347 <+455>:   add    $0x4,%rdi
   0x000000000040134b <+459>:   jne    0x401300 <main(int, char const**)+384>

vectorized rolled up:
   0x000000000040134d <+461>:   test   %r9,%r9
   0x0000000000401350 <+464>:   je     0x401385 <main(int, char const**)+517>
   0x0000000000401352 <+466>:   lea    (%rbx,%rax,1),%rdx
   0x0000000000401356 <+470>:   add    $0x20,%rdx
   0x000000000040135a <+474>:   shl    $0x7,%r9
   0x000000000040135e <+478>:   xor    %edi,%edi
   0x0000000000401360 <+480>:   vmovaps 0xcb8(%rip),%ymm0        # 0x402020
   0x0000000000401368 <+488>:   nopl   0x0(%rax,%rax,1)
   0x0000000000401370 <+496>:   vxorps -0x20(%rdx,%rdi,1),%ymm0,%ymm1
   0x0000000000401376 <+502>:   vmovups %ymm1,-0x20(%rdx,%rdi,1)
   0x000000000040137c <+508>:   sub    $0xffffffffffffff80,%rdi
   0x0000000000401380 <+512>:   cmp    %rdi,%r9
   0x0000000000401383 <+515>:   jne    0x401370 <main(int, char const**)+496>

It looks like the vectorization is great, except take a close look at the
dispatch code.  Jumps to +214 here are dispatches to the slow byte-by-byte
path:

   0x0000000000401229 <+169>:   lea    (%rax,%r10,1),%r11
   0x000000000040122d <+173>:   xor    %ecx,%ecx
   0x000000000040122f <+175>:   cmp    $0x8,%r10
   0x0000000000401233 <+179>:   jb     0x401256 <main(int, char const**)+214>
   0x0000000000401235 <+181>:   lea    -0x1(%r10),%rdx
=> 0x0000000000401239 <+185>:   cmp    $0x1f,%rdx
   0x000000000040123d <+189>:   ja     0x401256 <main(int, char const**)+214>

$r10 here is the length of the input string . . . but so is $rdx in the second
check.  So I am pretty sure this says "if length < 8 goto slow" and "if length
> 16 goto slow".  That leaves only lengths 9 through 15, which will then not
meet the requirements of the vectorized forms (ending up at +512).  So, I think
the vectorized forms are unreachable.

I saw similar results with clang 11, but it was even more clearly unreachable. 
In that case, if I'm reading things right, in order to select vectorization,
the dispatch requires the length to be both >= 128 and <= 16, which is
impossible:

clang-11 dispatch:
   0x0000000000401229 <+153>:   cmp    $0x80,%r8
   0x0000000000401230 <+160>:   jb     0x401330 <main(int, char const**)+416>
   0x0000000000401236 <+166>:   lea    -0x1(%r8),%rdx
   0x000000000040123a <+170>:   cmp    $0x1f,%rdx
   0x000000000040123e <+174>:   ja     0x401330 <main(int, char const**)+416>

I've stepped through the code for both clang 11 and 13 with various (long)
inputs and I have been unable to find a pathway that leads to any of the AVX2
forms.

It would be one thing if the loop couldn't be vectorized, but the fact that it
generates vectorized forms and then can't use them seems like a bug.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210324/1d47f32b/attachment-0001.html>