<!doctype html>
<html lang="en">
    <head>
      <!-- Encoding is declared first so it is known before any text is parsed. -->
      <meta charset="utf-8">
      <title>Bug 31283: Terrible ARM shuffle lowering for extend from &lt;2 x i8&gt; to &lt;8 x i8&gt;</title>
      <!-- Relative URLs in this message resolve against the bug tracker. -->
      <base href="https://llvm.org/bugs/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW --- - Terrible ARM shuffle lowering for extend from <2 x i8> to <8 x i8>"
   href="https://llvm.org/bugs/show_bug.cgi?id=31283">31283</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Terrible ARM shuffle lowering for extend from <2 x i8> to <8 x i8>
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Windows NT
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Backend: ARM
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>efriedma@codeaurora.org
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr>

        <tr>
          <th>Classification</th>
          <td>Unclassified
          </td>
        </tr></table>
      <p>
        <div>
        <pre>Testcase:

define <8 x i8> @f(<2 x i8> *%p) {
  %t = load <2 x i8>, <2 x i8> *%p
  %r = shufflevector <2 x i8> %t, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32
2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i8> %r
}

With llc -mtriple=armv7--linux-gnueabihf:

        vld1.16 {d16[0]}, [r0:16]
        vldr    d18, .LCPI0_0
        vmovl.u8        q8, d16
        vmovl.u16       q8, d16
        vtbl.8  d0, {d16}, d18
        bx      lr

The first instruction is great... vld1.16 produces exactly the result we want. 
The problem is the following four instructions, which add up to an identity
shuffle.  I think what's happening is that we treat vld1.16+vmovl.u8+vmovl.u16
as a single, legal DAG node ("load<LD2[%p], anyext from v2i8>"), so shuffle
combining never tries to eliminate the extra shuffles.  Excerpts from
SelectionDAG debug output:


Optimized type-legalized selection DAG: BB#0 'f:'
SelectionDAG has 14 nodes:
  t0: ch = EntryToken
        t18: i32 = extract_vector_elt t16, Constant:i32<0>
        t21: i32 = extract_vector_elt t16, Constant:i32<1>
      t24: v8i8 = BUILD_VECTOR t18, t21, undef:i32, undef:i32, undef:i32,
undef:i32, undef:i32, undef:i32
    t9: f64 = bitcast t24
  t11: ch,glue = CopyToReg t0, Register:f64 %D0, t9
    t2: i32,ch = CopyFromReg t0, Register:i32 %vreg0
  t16: v2i32,ch = load<LD2[%p], anyext from v2i8> t0, t2, undef:i32
  t12: ch = ARMISD::RET_FLAG t11, Register:f64 %D0, t11:1


Legalized selection DAG: BB#0 'f:'
SelectionDAG has 15 nodes:
  t0: ch = EntryToken
            t2: i32,ch = CopyFromReg t0, Register:i32 %vreg0
          t16: v2i32,ch = load<LD2[%p], anyext from v2i8> t0, t2, undef:i32
        t25: v8i8 = bitcast t16
            t38: i32 = ARMISD::Wrapper TargetConstantPool:i32<<8 x i8> <i8 0,
i8 4, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>> 0
          t35: f64,ch = load<LD8[ConstantPool]> t0, t38, undef:i32
        t36: v8i8 = bitcast t35
      t32: v8i8 = ARMISD::VTBL1 t25, t36
    t9: f64 = bitcast t32
  t11: ch,glue = CopyToReg t0, Register:f64 %D0, t9
  t12: ch = ARMISD::RET_FLAG t11, Register:f64 %D0, t11:1</pre>
        </div>
      </p>
      <hr>
      <span>You are receiving this mail because:</span>
      
      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>