<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - [VectorCombine?] Inversed <4 x i32> + @llvm.bswap.v4i32 = Inversed <16 x i8>"

   href="https://bugs.llvm.org/show_bug.cgi?id=45415">45415</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>[VectorCombine?] Inversed &lt;4 x i32&gt; + @llvm.bswap.v4i32 = Inversed &lt;16 x i8&gt;

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Scalar Optimizations

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>lebedev.ri@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>I was actually initially looking whether the pattern is vectorized at all, 

but it is, and the ir //seems// slightly suboptimal still:

<a href="https://godbolt.org/z/CXk4bt">https://godbolt.org/z/CXk4bt</a>

#include &lt;cstdint&gt;

#include &lt;cstring&gt;

#include &lt;array&gt;

void test(char* __restrict__ in, char* __restrict__ out) {

    for(int i = 0; i != 16; i += 4) {

        uint32_t tmp;

        memcpy(&amp;tmp, in + (16-4) - i, sizeof(uint32_t));

        tmp = __builtin_bswap32(tmp);

        memcpy(out + i, &amp;tmp, sizeof(uint32_t));

    }

}

Currently results in 

; Function Attrs: nofree nounwind uwtable

define dso_local void @_Z4testPcS_(i8* noalias nocapture readonly %in, i8*

noalias nocapture %out) local_unnamed_addr #0 {

entry:

  %0 = bitcast i8* %in to &lt;4 x i32&gt;*

  %1 = load &lt;4 x i32&gt;, &lt;4 x i32&gt;* %0, align 1

  %reorder_shuffle = shufflevector &lt;4 x i32&gt; %1, &lt;4 x i32&gt; undef, &lt;4 x i32&gt;

&lt;i32 3, i32 2, i32 1, i32 0&gt;

  %2 = call &lt;4 x i32&gt; @llvm.bswap.v4i32(&lt;4 x i32&gt; %reorder_shuffle)

  %3 = bitcast i8* %out to &lt;4 x i32&gt;*

  store &lt;4 x i32&gt; %2, &lt;4 x i32&gt;* %3, align 1

  ret void

}

Which is lowered as

.LCPI0_0:

        .byte   15                      # 0xf

        .byte   14                      # 0xe

        .byte   13                      # 0xd

        .byte   12                      # 0xc

        .byte   11                      # 0xb

        .byte   10                      # 0xa

        .byte   9                       # 0x9

        .byte   8                       # 0x8

        .byte   7                       # 0x7

        .byte   6                       # 0x6

        .byte   5                       # 0x5

        .byte   4                       # 0x4

        .byte   3                       # 0x3

        .byte   2                       # 0x2

        .byte   1                       # 0x1

        .byte   0                       # 0x0

test(char*, char*):                            # @test(char*, char*)

        vmovdqu xmm0, xmmword ptr [rdi]

        vpshufb xmm0, xmm0, xmmword ptr [rip + .LCPI0_0] # xmm0 =

xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]

        vmovdqu xmmword ptr [rsi], xmm0

        ret

So we could combine those two shuffles in IR too:

; Function Attrs: nofree nounwind uwtable

define dso_local void @_Z4testPcS_(i8* noalias nocapture readonly %in, i8*

noalias nocapture %out) local_unnamed_addr #0 {

entry:

  %t0 = bitcast i8* %in to &lt;16 x i8&gt;*

  %t1 = bitcast i8* %out to &lt;16 x i8&gt;*

  %t2 = load &lt;16 x i8&gt;, &lt;16 x i8&gt;* %t0, align 1

  %t3 = shufflevector &lt;16 x i8&gt; %t2, &lt;16 x i8&gt; undef, &lt;16 x i32&gt;

                      &lt;i32 15, i32 14, i32 13, i32 12,

                       i32 11, i32 10,  i32 9,  i32 8,

                        i32 7,  i32 6,  i32 5,  i32 4,

                        i32 3,  i32 2,  i32 1,  i32 0&gt;

  store &lt;16 x i8&gt; %t3, &lt;16 x i8&gt;* %t1, align 1

  ret void

}</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>