[llvm-bugs] [Bug 45415] New: [VectorCombine?] Inversed <4 x i32> + @llvm.bswap.v4i32 = Inversed <16 x i8>

Fri Apr 3 02:38:38 PDT 2020

https://bugs.llvm.org/show_bug.cgi?id=45415

            Bug ID: 45415
           Summary: [VectorCombine?] Inversed <4 x i32> +
                    @llvm.bswap.v4i32 = Inversed <16 x i8>
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Scalar Optimizations
          Assignee: unassignedbugs at nondot.org
          Reporter: lebedev.ri at gmail.com
                CC: llvm-bugs at lists.llvm.org

I was initially checking whether this pattern gets vectorized at all. It does,
but the resulting IR still //seems// slightly suboptimal:

https://godbolt.org/z/CXk4bt

#include <cstdint>
#include <cstring>
#include <array>

// Copies the 16 bytes at `in` to `out` in fully reversed byte order
// (out[k] = in[15 - k]), working one 32-bit word at a time: the four words
// are visited back to front and each word is byte-swapped.
void test(char* __restrict__ in, char* __restrict__ out) {
    for(int i = 0; i != 16; i += 4) {
        uint32_t tmp;
        // Read the word at offset 12 - i, i.e. walk the input from its last
        // word to its first.
        memcpy(&tmp, in + (16-4) - i, sizeof(uint32_t));
        // Reverse the four bytes inside the word.
        tmp = __builtin_bswap32(tmp);
        // Write the words to the output front to back.
        memcpy(out + i, &tmp, sizeof(uint32_t));
    }
}

This currently results in the following IR:

; Current output: the 16-byte reversal is expressed as two separate steps,
; a <4 x i32> lane reversal followed by a per-lane bswap.
; Function Attrs: nofree nounwind uwtable
define dso_local void @_Z4testPcS_(i8* noalias nocapture readonly %in, i8*
noalias nocapture %out) local_unnamed_addr #0 {
entry:
  ; Load the 16 input bytes as four i32 lanes (align 1: no alignment assumed).
  %0 = bitcast i8* %in to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 1
  ; Step 1: reverse the order of the four i32 lanes.
  %reorder_shuffle = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32>
<i32 3, i32 2, i32 1, i32 0>
  ; Step 2: reverse the bytes inside each i32 lane.
  %2 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %reorder_shuffle)
  ; Store the four lanes to the output.
  %3 = bitcast i8* %out to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 1
  ret void
}

That IR is then lowered to:

# Constant-pool entry used below as the vpshufb control mask: result byte k
# is taken from source byte 15 - k, i.e. a full 16-byte reversal.
.LCPI0_0:
        .byte   15                      # 0xf
        .byte   14                      # 0xe
        .byte   13                      # 0xd
        .byte   12                      # 0xc
        .byte   11                      # 0xb
        .byte   10                      # 0xa
        .byte   9                       # 0x9
        .byte   8                       # 0x8
        .byte   7                       # 0x7
        .byte   6                       # 0x6
        .byte   5                       # 0x5
        .byte   4                       # 0x4
        .byte   3                       # 0x3
        .byte   2                       # 0x2
        .byte   1                       # 0x1
        .byte   0                       # 0x0
test(char*, char*):                            # @test(char*, char*)
        # Load 16 bytes from the address in rdi.
        vmovdqu xmm0, xmmword ptr [rdi]
        # A single byte shuffle performs both the lane reversal and the bswap.
        vpshufb xmm0, xmm0, xmmword ptr [rip + .LCPI0_0] # xmm0 =
xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
        # Store the reversed 16 bytes to the address in rsi.
        vmovdqu xmmword ptr [rsi], xmm0
        ret

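The backend already merges the two shuffles (the <4 x i32> lane reversal and
the bswap, which is itself a per-lane byte shuffle) into a single byte shuffle,
so we could combine them in IR too: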

; Proposed output: the lane reversal and the bswap are folded into a single
; <16 x i8> shufflevector that reverses all 16 bytes.
; Function Attrs: nofree nounwind uwtable
define dso_local void @_Z4testPcS_(i8* noalias nocapture readonly %in, i8*
noalias nocapture %out) local_unnamed_addr #0 {
entry:
  ; View both buffers as <16 x i8> so the shuffle can address single bytes.
  %t0 = bitcast i8* %in to <16 x i8>*
  %t1 = bitcast i8* %out to <16 x i8>*
  %t2 = load <16 x i8>, <16 x i8>* %t0, align 1
  ; One byte shuffle: result byte k is source byte 15 - k.
  %t3 = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32>
                      <i32 15, i32 14, i32 13, i32 12,
                       i32 11, i32 10,  i32 9,  i32 8,
                        i32 7,  i32 6,  i32 5,  i32 4,
                        i32 3,  i32 2,  i32 1,  i32 0>
  store <16 x i8> %t3, <16 x i8>* %t1, align 1
  ret void
}

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20200403/affda705/attachment-0001.html>