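<!doctype html>
<html lang="en">

    <head>

      <meta charset="utf-8">

      <title>Bug 34883 - [LoopDataPrefetch] - places prefetches between a load and its single user, which disrupts instruction selection.</title>

      <base href="https://bugs.llvm.org/">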

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - [LoopDataPrefetch] - places prefetches between a load and its single user, which disrupts instruction selection."

   href="https://bugs.llvm.org/show_bug.cgi?id=34883">34883</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>[LoopDataPrefetch]  - places prefetches between a load and its single user, which disrupts instruction selection.

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Scalar Optimizations

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>paulsson@linux.vnet.ibm.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Created <span class=""><a href="attachment.cgi?id=19250" name="attach_19250" title="reduced testcase">attachment 19250</a> <a href="attachment.cgi?id=19250&amp;action=edit" title="reduced testcase">[details]</a></span>

reduced testcase

On SystemZ, it is good to utilize the vector load element instruction whenever

possible, which can load from memory and insert into a vector element with a

single instruction.

In this test case, there are four loads followed by four insertelement

instructions that together load a vector with four 32bit elements. Without the

LoopDataPrefetch pass, this is selected into VLEFs (the first one is a VLREP

though, as expected), but with the prefetches this does not happen.

  %15 = load i32, i32* %11, align 4, !tbaa !1

  %16 = load i32, i32* %12, align 4, !tbaa !1

  %17 = load i32, i32* %13, align 4, !tbaa !1

  %18 = load i32, i32* %14, align 4, !tbaa !1

  %19 = insertelement &lt;4 x i32&gt; undef, i32 %15, i32 0

  %20 = insertelement &lt;4 x i32&gt; %19, i32 %16, i32 1

  %21 = insertelement &lt;4 x i32&gt; %20, i32 %17, i32 2

  %22 = insertelement &lt;4 x i32&gt; %21, i32 %18, i32 3

=> LoopDataPrefetch pass

  call void @llvm.prefetch(i8* %scevgep1, i32 0, i32 3, i32 1)

  %23 = load i32, i32* %19, align 4, !tbaa !1

  call void @llvm.prefetch(i8* %scevgep23, i32 0, i32 3, i32 1)

  %24 = load i32, i32* %20, align 4, !tbaa !1

  call void @llvm.prefetch(i8* %scevgep45, i32 0, i32 3, i32 1)

  %25 = load i32, i32* %21, align 4, !tbaa !1

  call void @llvm.prefetch(i8* %scevgep67, i32 0, i32 3, i32 1)

  %26 = load i32, i32* %22, align 4, !tbaa !1

  %27 = insertelement &lt;4 x i32&gt; undef, i32 %23, i32 0

  %28 = insertelement &lt;4 x i32&gt; %27, i32 %24, i32 1

  %29 = insertelement &lt;4 x i32&gt; %28, i32 %25, i32 2

  %30 = insertelement &lt;4 x i32&gt; %29, i32 %26, i32 3

It seems that the prefetches are placed before each load, but this is not good

enough in this case as this is a sequence of several loads.

The DAG then looks like:

Optimized legalized selection DAG: BB#1 'BZ2_blockSort:vector.body210'

SelectionDAG has 79 nodes:

  t0: ch = EntryToken

  t2: i64,ch = CopyFromReg t0, Register:i64 %vreg1

      t10: i64 = add t2, Constant:i64<163840>

    t116: ch = SystemZISD::PREFETCH&lt;LD1[%scevgep13]&gt; t0, Constant:i32&lt;1&gt;, t10

  t23: i32,ch = load&lt;LD4[%lsr.iv](tbaa=&lt;0x52db148&gt;)&gt; t116, t2, undef:i64

      t8: i64 = add t2, Constant:i64<164864>

    t115: ch = SystemZISD::PREFETCH&lt;LD1[%scevgep12]&gt; t23:1, Constant:i32&lt;1&gt;, t8

    t12: i64 = add t2, Constant:i64<1024>

  t25: i32,ch = load&lt;LD4[%scevgep19](tbaa=&lt;0x52db148&gt;)&gt; t115, t12, undef:i64

      t6: i64 = add t2, Constant:i64<165888>

    t114: ch = SystemZISD::PREFETCH&lt;LD1[%scevgep11]&gt; t25:1, Constant:i32&lt;1&gt;, t6

    t14: i64 = add t2, Constant:i64<2048>

  t27: i32,ch = load&lt;LD4[%scevgep17](tbaa=&lt;0x52db148&gt;)&gt; t114, t14, undef:i64

    t45: i64 = add t2, Constant:i64<4>

  t96: i32,ch = load&lt;LD4[%scevgep20](tbaa=&lt;0x52db148&gt;)&gt; t113, t45, undef:i64

    t47: i64 = add t2, Constant:i64<1028>

  t93: i32,ch = load&lt;LD4[%scevgep18](tbaa=&lt;0x52db148&gt;)&gt; t113, t47, undef:i64

    t49: i64 = add t2, Constant:i64<2052>

  t90: i32,ch = load&lt;LD4[%scevgep16](tbaa=&lt;0x52db148&gt;)&gt; t113, t49, undef:i64

    t51: i64 = add t2, Constant:i64<3076>

  t87: i32,ch = load&lt;LD4[%scevgep14](tbaa=&lt;0x52db148&gt;)&gt; t113, t51, undef:i64

    t16: i64 = add t2, Constant:i64<3072>

  t29: i32,ch = load&lt;LD4[%scevgep15](tbaa=&lt;0x52db148&gt;)&gt; t113, t16, undef:i64

    t4: i64 = add t2, Constant:i64<166912>

  t113: ch = SystemZISD::PREFETCH&lt;LD1[%scevgep10]&gt; t27:1, Constant:i32&lt;1&gt;, t4

  t122: v4i32 = SystemZISD::ROTATE_MASK Constant:i32<11>, Constant:i32<9>

          t40: i64,ch = CopyFromReg t0, Register:i64 %vreg2

        t66: i64 = add t40, Constant:i64<4>

      t68: ch = CopyToReg t0, Register:i64 %vreg3, t66

        t70: i64 = add t2, Constant:i64<4096>

      t72: ch = CopyToReg t0, Register:i64 %vreg4, t70

          t74: i64,ch = CopyFromReg t0, Register:i64 %vreg0

        t76: i64 = add t74, Constant:i64<-4>

      t78: ch = CopyToReg t0, Register:i64 %vreg5, t76

                    t104: v4i32 = SystemZISD::REPLICATE t23

                  t105: v4i32 = insert_vector_elt t104, t25, Constant:i32<1>

                t107: v4i32 = insert_vector_elt t105, t27, Constant:i32<2>

              t108: v4i32 = insert_vector_elt t107, t29, Constant:i32<3>

            t38: v4i32 = and t108, t122

          t43: ch = store&lt;ST16[undef](align=4)(tbaa=&lt;0x52db148&gt;)&gt; t29:1, t38,

undef:i64, undef:i64

        t98: ch = TokenFactor t87:1, t90:1, t93:1, t43, t96:1

                  t109: v4i32 = SystemZISD::REPLICATE t96

                t110: v4i32 = insert_vector_elt t109, t93, Constant:i32<1>

              t111: v4i32 = insert_vector_elt t110, t90, Constant:i32<2>

            t112: v4i32 = insert_vector_elt t111, t87, Constant:i32<3>

          t60: v4i32 = and t112, t122

            t118: v16i8 = SystemZISD::BYTE_MASK Constant:i32<65535>

          t119: v4i32 = bitcast t118

        t63: v4i32 = add t60, t119

      t65: ch = store&lt;ST16[undef](align=4)(tbaa=&lt;0x52db148&gt;)&gt; t98, t63,

undef:i64, undef:i64

    t80: ch = TokenFactor t68, t72, t78, t65

  t81: ch = br t80, BasicBlock:ch&lt;vector.body210 0x53366e8&gt;

It seems that the pattern matcher for VLEF fails because each prefetch node is

chained between the loads for the vector elements. Without the prefetch nodes,

the loads are not chained and the pattern matcher succeeds.

llc -mtriple=s390x-linux-gnu -mcpu=z13 tc_pfd.ll</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>