<html>

    <head>

      <base href="https://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - Disabling gep merging pessimizes the code after loop vectorization"

   href="https://llvm.org/bugs/show_bug.cgi?id=23580">23580</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Disabling gep merging pessimizes the code after loop vectorization

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Scalar Optimizations

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>wmi@google.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvmbugs@cs.uiuc.edu

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Created <span class=""><a href="attachment.cgi?id=14343" name="attach_14343" title="testcase 1.cc">attachment 14343</a> <a href="attachment.cgi?id=14343&amp;action=edit" title="testcase 1.cc">[details]</a></span>

testcase 1.cc

Gep merging was mostly disabled in r235455 and that was good for most of the

cases. However, I found a case where disabling gep merging will pessimize the

code after loop vectorization. It may be better to relax the constraint of gep

merging a little and allow the src gep with single use to be merged with dest

gep in the same bb.

For the kernel loop (marked in testcase 1.cc), without gep merging, loop

vectorization cannot recognize that the ptrs of the load between consecutive loop

iterations are consecutive. That is because

LoopVectorizationLegality::isConsecutivePtr only checks one level of gep to find

the induction information. The problem will pessimize the vectorization code a

lot in this case.

for.body:                                         ; preds = %scalar.ph,

%for.body

  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [

%bc.trunc.resume.val, %scalar.ph ]

  %arrayidx16 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64

%indvars.iv

  %ival = getelementptr inbounds %struct.B, %struct.B* %arrayidx16, i64 0, i32

0

  %72 = load i16, i16* %ival, align 2      ===> consecutive load

  %conv17 = sext i16 %72 to i32

  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1

  %arrayidx19 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64

%indvars.iv.next

  %ival20 = getelementptr inbounds %struct.B, %struct.B* %arrayidx19, i64 0,

i32 0

  %73 = load i16, i16* %ival20, align 2    ===> consecutive load

  %conv21 = sext i16 %73 to i32

  %add22 = add nsw i32 %conv21, %conv17

  %mul = mul nsw i32 %add22, %11

  %add23 = add nsw i32 %mul, %conv7

  %shr = ashr i32 %add23, %conv

  %ival26 = getelementptr inbounds %struct.B, %struct.B* %call6, i64

%indvars.iv, i32 0

  %74 = load i16, i16* %ival26, align 2

  %conv2783 = zext i16 %74 to i32

  %sub = sub i32 %conv2783, %shr

  %conv28 = trunc i32 %sub to i16

  store i16 %conv28, i16* %ival26, align 2

  %lftr.wideiv = trunc i64 %indvars.iv.next to i32

  %exitcond = icmp eq i32 %lftr.wideiv, %2

  br i1 %exitcond, label %for.cond.loopexit98, label %for.body, !llvm.loop !7

After gep merging:

for.body:                                         ; preds =

%for.body.preheader, %for.body

  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1,

%for.body.preheader ]

  %ival = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64

%indvars.iv, i32 0

  %12 = load i16, i16* %ival, align 2

  %conv17 = sext i16 %12 to i32

  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1

  %ival20 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64

%indvars.iv.next, i32 0

  %13 = load i16, i16* %ival20, align 2

  %conv21 = sext i16 %13 to i32

  %add22 = add nsw i32 %conv21, %conv17

  %mul = mul nsw i32 %add22, %11

  %add23 = add nsw i32 %mul, %conv7

  %shr = ashr i32 %add23, %conv

  %ival26 = getelementptr inbounds %struct.B, %struct.B* %call6, i64

%indvars.iv, i32 0

  %14 = load i16, i16* %ival26, align 2

  %conv2783 = zext i16 %14 to i32

  %sub = sub i32 %conv2783, %shr

  %conv28 = trunc i32 %sub to i16

  store i16 %conv28, i16* %ival26, align 2

  %lftr.wideiv = trunc i64 %indvars.iv.next to i32

  %exitcond = icmp eq i32 %lftr.wideiv, %2

  br i1 %exitcond, label %for.cond.loopexit98, label %for.body, !llvm.loop !2

vectorization code without gep merging:

.LBB0_19:                               # %vector.body

        xorps   %xmm3, %xmm3

        movss   %xmm10, %xmm3           # xmm3 = xmm10[0],xmm3[1,2,3]

        leaq    1(%r12), %rdi

        movd    %rdi, %xmm4

        pshufd  $68, %xmm4, %xmm5       # xmm5 = xmm4[0,1,0,1]

        movl    $1, %edx

        movd    %rdx, %xmm4

        pslldq  $8, %xmm4               # xmm4 =

zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]

        paddq   %xmm5, %xmm4

        movdqa  %xmm5, %xmm6

        paddq   %xmm7, %xmm6

        pshufd  $78, %xmm4, %xmm4       # xmm4 = xmm4[2,3,0,1]

        movd    %xmm4, %rdi

        movd    %xmm6, %rdx

        pshufd  $78, %xmm6, %xmm4       # xmm4 = xmm6[2,3,0,1]

        movd    %xmm4, %rsi

        pinsrw  $0, (%r10,%r12,2), %xmm4

        pinsrw  $2, (%r8,%rdi,2), %xmm4

        pinsrw  $4, (%r8,%rdx,2), %xmm4

        pinsrw  $6, (%r8,%rsi,2), %xmm4

        pslld   $16, %xmm4

        psrad   $16, %xmm4

        movdqa  %xmm5, %xmm6

        paddq   %xmm1, %xmm6

        paddq   %xmm2, %xmm5

        movd    %xmm5, %rdx

        pshufd  $78, %xmm5, %xmm5       # xmm5 = xmm5[2,3,0,1]

        movd    %xmm5, %rsi

        movd    %xmm6, %rdi

        pshufd  $78, %xmm6, %xmm5       # xmm5 = xmm6[2,3,0,1]

        movq    %r10, %r11

        movd    %xmm5, %r10

        pinsrw  $0, (%r8,%rdx,2), %xmm5

        pinsrw  $2, (%r8,%rsi,2), %xmm5

        pinsrw  $4, (%r8,%rdi,2), %xmm5

        pinsrw  $6, (%r8,%r10,2), %xmm5

        movq    %r11, %r10

        pslld   $16, %xmm5

        psrad   $16, %xmm5

        paddd   %xmm4, %xmm5

        pshufd  $245, %xmm5, %xmm4      # xmm4 = xmm5[1,1,3,3]

        pmuludq %xmm0, %xmm5

        pshufd  $232, %xmm5, %xmm5      # xmm5 = xmm5[0,2,2,3]

        pshufd  $245, %xmm0, %xmm6      # xmm6 = xmm0[1,1,3,3]

        pmuludq %xmm4, %xmm6

        pshufd  $232, %xmm6, %xmm4      # xmm4 = xmm6[0,2,2,3]

        punpckldq       %xmm4, %xmm5    # xmm5 =

xmm5[0],xmm4[0],xmm5[1],xmm4[1]

        paddd   %xmm9, %xmm5

        psrad   %xmm3, %xmm5

        movq    2(%rax,%r12,2), %xmm3   # xmm3 = mem[0],zero

        punpcklwd       %xmm8, %xmm3    # xmm3 =

xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]

        psubw   %xmm5, %xmm3

        pshuflw $232, %xmm3, %xmm3      # xmm3 = xmm3[0,2,2,3,4,5,6,7]

        pshufhw $232, %xmm3, %xmm3      # xmm3 = xmm3[0,1,2,3,4,6,6,7]

        pshufd  $232, %xmm3, %xmm3      # xmm3 = xmm3[0,2,2,3]

        movq    %xmm3, 2(%rax,%r12,2)

        addq    $4, %r12

        cmpq    %r12, %r9

        jne     .LBB0_19

vectorization code with gep merging:

.LBB0_19:                               # %vector.body

        xorps   %xmm3, %xmm3

        movss   %xmm2, %xmm3            # xmm3 = xmm2[0],xmm3[1,2,3]

        movq    -2(%rdx), %xmm4         # xmm4 = mem[0],zero

        punpcklwd       %xmm4, %xmm4    # xmm4 = xmm4[0,0,1,1,2,2,3,3]

        psrad   $16, %xmm4

        movq    (%rdx), %xmm5           # xmm5 = mem[0],zero

        punpcklwd       %xmm5, %xmm5    # xmm5 = xmm5[0,0,1,1,2,2,3,3]

        psrad   $16, %xmm5

        paddd   %xmm4, %xmm5

        pshufd  $245, %xmm5, %xmm4      # xmm4 = xmm5[1,1,3,3]

        pmuludq %xmm0, %xmm5

        pshufd  $232, %xmm5, %xmm5      # xmm5 = xmm5[0,2,2,3]

        pshufd  $245, %xmm0, %xmm6      # xmm6 = xmm0[1,1,3,3]

        pmuludq %xmm4, %xmm6

        pshufd  $232, %xmm6, %xmm4      # xmm4 = xmm6[0,2,2,3]

        punpckldq       %xmm4, %xmm5    # xmm5 =

xmm5[0],xmm4[0],xmm5[1],xmm4[1]

        paddd   %xmm1, %xmm5

        psrad   %xmm3, %xmm5

        movq    (%rsi), %xmm3           # xmm3 = mem[0],zero

        punpcklwd       %xmm7, %xmm3    # xmm3 =

xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]

        psubw   %xmm5, %xmm3

        pshuflw $232, %xmm3, %xmm3      # xmm3 = xmm3[0,2,2,3,4,5,6,7]

        pshufhw $232, %xmm3, %xmm3      # xmm3 = xmm3[0,1,2,3,4,6,6,7]

        pshufd  $232, %xmm3, %xmm3      # xmm3 = xmm3[0,2,2,3]

        movq    %xmm3, (%rsi)

        addq    $8, %rdx

        addq    $8, %rsi

        addq    $-4, %rdi

        jne     .LBB0_19</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>