[llvm-bugs] [Bug 24413] New: [loop vectorizer] unoptimized vectorized code for induction variable

Mon Aug 10 09:47:36 PDT 2015

https://llvm.org/bugs/show_bug.cgi?id=24413

            Bug ID: 24413
           Summary: [loop vectorizer] unoptimized vectorized code for
                    induction variable
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: wmi at google.com
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

For the simple loop below,

testcase 1.c:
typedef struct ST {
  unsigned char u;
  unsigned char v;
} ST;

ST c[10000];

int foo(int j, int N) {
  int i;
  int total1;

  for (i = j; i < N; i++) {
    total1 += c[i].u;
  }
  return total1;
}

~/workarea/llvm-r243653/build/bin/clang -O2 -fno-unroll-loops -S 1.c

llvm generated vectorized code for the kernel loop:
.LBB0_4:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
***
        movd    %rcx, %xmm4
        pshufd  $68, %xmm4, %xmm4       # xmm4 = xmm4[0,1,0,1]
        movdqa  %xmm4, %xmm5
        paddq   %xmm1, %xmm5
        paddq   %xmm2, %xmm4
        pshufd  $78, %xmm5, %xmm5       # xmm5 = xmm5[2,3,0,1]
        movd    %xmm5, %rdi
        movd    %xmm4, %r11
        pshufd  $78, %xmm4, %xmm4       # xmm4 = xmm4[2,3,0,1]
        movd    %xmm4, %r9
***    # the code segment above is to set rdi/r11/r9 to i+1, i+2, i+3
        movzbl  c(%rcx,%rcx), %edx
        pinsrw  $0, %edx, %xmm4
        movzbl  c(%rdi,%rdi), %edx
        pinsrw  $2, %edx, %xmm4
        movzbl  c(%r11,%r11), %edx
        pinsrw  $4, %edx, %xmm4
        movzbl  c(%r9,%r9), %edx
        pinsrw  $6, %edx, %xmm4
        pand    %xmm3, %xmm4
        paddd   %xmm4, %xmm0
        addq    $4, %rcx
        addq    $-4, %rax
        jne     .LBB0_4

It splats the induction variable into an xmm register, adds the vector
[0, 1, 2, 3] to that register, and then extracts the scalar elements from the
result. This is unnecessarily more complex than simply using scalar copies of
the induction variable. The loop could instead be generated as:

.LBB0_4:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
***
        leaq    1(%rcx), %rdi
        leaq    2(%rcx), %r11
        leaq    3(%rcx), %r9
***     # the code segment above is to set rdi/r11/r9 to i+1, i+2, i+3
        movzbl  c(%rcx,%rcx), %edx
        pinsrw  $0, %edx, %xmm4
        movzbl  c(%rdi,%rdi), %edx
        pinsrw  $2, %edx, %xmm4
        movzbl  c(%r11,%r11), %edx
        pinsrw  $4, %edx, %xmm4
        movzbl  c(%r9,%r9), %edx
        pinsrw  $6, %edx, %xmm4
        pand    %xmm3, %xmm4
        paddd   %xmm4, %xmm0
        addq    $4, %rcx
        addq    $-4, %rax
        jne     .LBB0_4

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20150810/a3e9c44e/attachment.html>