<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - [Loop Vectorizer] Bad code generated for stores in AVX 512"

   href="https://bugs.llvm.org/show_bug.cgi?id=39460">39460</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>[Loop Vectorizer] Bad code generated for stores in AVX 512

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>tools

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>All

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>opt

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>anna@azul.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>For the test case below, we generate bad code after vectorization: 

cat repro.ll 

; ModuleID = 'trunk2.ll'

source_filename = "trunk2.ll"

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"

target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: uwtable

define void @ham() #0 !prof !0 {

bb:

  br label %bb1

bb1:                                              ; preds = %bb1, %bb

  %tmp = phi i64 [ %tmp10, %bb1 ], [ 2, %bb ]

  %tmp2 = lshr exact i64 %tmp, 1

  %tmp3 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)*

addrspace(1)* undef, i64 %tmp2

  store i8 addrspace(1)* undef, i8 addrspace(1)* addrspace(1)* %tmp3, align 8

  %tmp4 = add nuw nsw i64 %tmp, 2

  %tmp5 = lshr exact i64 %tmp4, 1

  %tmp6 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)*

addrspace(1)* undef, i64 %tmp5

  store i8 addrspace(1)* undef, i8 addrspace(1)* addrspace(1)* %tmp6, align 8

  %tmp7 = add nuw nsw i64 %tmp, 4

  %tmp8 = lshr exact i64 %tmp7, 1

  %tmp9 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)*

addrspace(1)* undef, i64 %tmp8

  store i8 addrspace(1)* undef, i8 addrspace(1)* addrspace(1)* %tmp9, align 8

  %tmp10 = add nuw nsw i64 %tmp, 6

  %tmp11 = icmp ult i64 %tmp10, 302

  br i1 %tmp11, label %bb1, label %bb12

bb12:                                             ; preds = %bb1

  unreachable

}

attributes #0 = { uwtable "target-cpu"="skylake-avx512"

"target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,+xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-pconfig,+clwb,+avx512f,-clzero,+pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,+avx512bw,+clflushopt,+xsave,-avx512vbmi2,+64bit,+avx512vl,+invpcid,+avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,+avx512dq,+adx,-avx512pf,+sse3"

}

!0 = !{!"function_entry_count", i64 32768}

To reproduce on ToT: opt -loop-vectorize repro.ll -S 

--------------------------

The vectorized code contains the following snippet:

%induction = add &lt;8 x i64&gt; %broadcast.splat, &lt;i64 0, i64 6, i64 12, i64 18, i64

24, i64 30, i64 36, i64 42&gt;

  %1 = add i64 %offset.idx, 0

  %2 = lshr exact i64 %1, 1

  %3 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)*

undef, i64 %2

  %4 = add nuw nsw i64 %1, 2

  %5 = lshr exact i64 %4, 1

  %6 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)*

undef, i64 %5

  %7 = add nuw nsw i64 %1, 4

  %8 = lshr exact i64 %7, 1

  %9 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)*

undef, i64 %8

  %10 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)*

%9, i32 -2

  %11 = bitcast i8 addrspace(1)* addrspace(1)* %10 to &lt;24 x i8 addrspace(1)*&gt;

addrspace(1)*

  store &lt;24 x i8 addrspace(1)*&gt; undef, &lt;24 x i8 addrspace(1)*&gt; addrspace(1)*

%11, align 8

  %index.next = add i64 %index, 8

  %12 = icmp eq i64 %index.next, 48

As we can see, VF = 8 for the loop, but we generate &lt;24 x i8 addrspace(1)*&gt;.

We should be generating &lt;8 x i8 addrspace(1)*&gt; vectors instead.

LV debug shows:

LV: We can vectorize this loop!

LV: The Smallest and Widest types: 64 / 64 bits.

LV: The Widest register safe to use is: 512 bits.

...

LV: Vector loop of width 8 costs: 2.

LV: Selecting VF: 8.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>