<html>
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - Bad codegen for vbslq_u32() intrinsic"
   href="https://bugs.llvm.org/show_bug.cgi?id=49961">49961</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Bad codegen for vbslq_u32() intrinsic
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Backend: AArch64
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>mkuper@google.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>arnaud.degrandmaison@arm.com, llvm-bugs@lists.llvm.org, smithp352@googlemail.com, Ties.Stuij@arm.com
          </td>
        </tr></table>
        <div>
        <pre>Consider:

int foo(uint32x4x2_t reg, uint32x4_t mask, int index) {
  return vbslq_u32(mask, reg.val[0], reg.val[1])[index];
}

clang vs gcc: <a href="https://gcc.godbolt.org/z/YPe3TK79P">https://gcc.godbolt.org/z/YPe3TK79P</a>

clang trunk:
foo(uint32x4x2_t, __Uint32x4_t, int):    // @foo(uint32x4x2_t, __Uint32x4_t, int)
        sub     sp, sp, #48                     // =48
        and     x8, x0, #0x3
        add     x10, sp, #32                    // =32
        str     q1, [sp, #32]
        mov     x9, sp
        add     x11, sp, #16                    // =16
        bfi     x10, x8, #2, #2
        and     v0.16b, v0.16b, v2.16b
        bfi     x9, x8, #2, #2
        bfi     x11, x8, #2, #2
        ldr     w8, [x10]
        str     q2, [sp, #16]
        ldr     w10, [x11]
        str     q0, [sp]
        ldr     w9, [x9]
        bic     w8, w8, w10
        orr     w0, w8, w9
        add     sp, sp, #48                     // =48
        ret

gcc trunk:
foo(uint32x4x2_t, __Uint32x4_t, int):
        bsl     v2.16b, v0.16b, v1.16b
        sub     sp, sp, #16
        str     q2, [sp]
        ldr     w0, [sp, w0, sxtw 2]
        add     sp, sp, 16
        ret

From a cursory examination of what's going on, clang lowers vbslq_u32(mask, a,
b) to a vector "or(and(a, mask), and(b,~mask))", which the backend expects to
match. However, in this case, something in the midend decides it's best to
first extract the elements at "index" from both vectors, and then do the
or(and(), and()) song-and-dance in the scalar domain.</pre>
        </div>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>