<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - sub-optimal codegen for llvm.experimental.vector.reduce of <N x i1>"

   href="https://bugs.llvm.org/show_bug.cgi?id=38840">38840</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>sub-optimal codegen for llvm.experimental.vector.reduce of &lt;N x i1&gt;

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>All

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>gonzalobg88@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, spatel+llvm@rotateright.com

          </td>

        </tr></table>

      <p>

        <div>

        <pre>The llvm.experimental.vector.reduce.{and,or,xor} instructions of the x86

backend produce very sub-optimal machine code. See it live:

<a href="https://gcc.godbolt.org/z/qIHi6D">https://gcc.godbolt.org/z/qIHi6D</a>

LLVM-IR:

declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1>);

declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1>);

declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>);

declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1>);

define i1 @and128_x2(<2 x i64>) {

    %a = trunc <2 x i64> %0 to <2 x i1>

    %b = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a)

    ret i1 %b

}

define i1 @and_x4(<4 x i32>) {

    %a = trunc <4 x i32> %0 to <4 x i1>

    %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)

    ret i1 %b

}

define i1 @and128_x8(<8 x i8>) {

    %a = trunc <8 x i8> %0 to <8 x i1>

    %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)

    ret i1 %b

}

define i1 @and256_x4(<4 x i64>) {

    %a = trunc <4 x i64> %0 to <4 x i1>

    %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a)

    ret i1 %b

}

define i1 @and_x8(<8 x i32>) {

    %a = trunc <8 x i32> %0 to <8 x i1>

    %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a)

    ret i1 %b

}

define i1 @and256_x32(<32 x i8>) {

    %a = trunc <32 x i8> %0 to <32 x i1>

    %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a)

    ret i1 %b

}

produces

and128_x2: # @and128_x2

  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]

  pand %xmm0, %xmm1

  movd %xmm1, %eax

  retq

and_x4: # @and_x4

  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]

  pand %xmm0, %xmm1

  pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]

  pand %xmm1, %xmm0

  movd %xmm0, %eax

  retq

and128_x8: # @and128_x8

  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]

  pand %xmm0, %xmm1

  pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]

  pand %xmm1, %xmm0

  movdqa %xmm0, %xmm1

  psrld $16, %xmm1

  pand %xmm0, %xmm1

  movd %xmm1, %eax

  retq

and256_x4: # @and256_x4

  shufps $136, %xmm1, %xmm0 # xmm0 = xmm0[0,2],xmm1[0,2]

  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]

  pand %xmm0, %xmm1

  pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]

  pand %xmm1, %xmm0

  movd %xmm0, %eax

  retq

and256_x8: # @and_x8

  pshuflw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3,4,5,6,7]

  pshufhw $232, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2,3,4,6,6,7]

  pshufd $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3]

  pshuflw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3,4,5,6,7]

  pshufhw $232, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2,3,4,6,6,7]

  pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]

  punpcklqdq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]

  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]

  pand %xmm0, %xmm1

  pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]

  pand %xmm1, %xmm0

  movdqa %xmm0, %xmm1

  psrld $16, %xmm1

  pand %xmm0, %xmm1

  movd %xmm1, %eax

  retq

and256_x32: # @and256_x32

  pand %xmm1, %xmm0

  pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]

  pand %xmm0, %xmm1

  pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]

  pand %xmm1, %xmm0

  movdqa %xmm0, %xmm1

  psrld $16, %xmm1

  pand %xmm0, %xmm1

  movdqa %xmm1, %xmm0

  psrlw $8, %xmm0

  pand %xmm1, %xmm0

  movd %xmm0, %eax

  retq

but these should all lower to a single movmsk instruction:

and128_x2:

  movmskpd %xmm0, %eax

  retq

and128_x4:

  movmskps %xmm0, %eax

  retq

and128_x8:

  pmovmskb %xmm0, %eax

  retq

and256_x4:

  vmovmskpd %ymm0, %eax

  vzeroupper

  retq

and256_x8:

  vmovmskps %ymm0, %eax

  vzeroupper

  retq

and256_x32:

  vpmovmskb %ymm0, %eax

  vzeroupper

  retq

The llvm.experimental.vector.reduce.and for <8 x i16>, <16 x i16>, <1 x i128>,

<2 x i128>, etc. probably produce very sub-optimal machine code for i1 vectors

as well. 

The llvm.experimental.vector.reduce.or and llvm.experimental.vector.reduce.xor

probably produce very sub-optimal machine code for all these i1 vectors too.

These llvm intrinsics are critical for efficiently performing coherent control

flow.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>