[llvm-bugs] [Bug 39414] New: Poor code generation for NEON unaligned loads/stores

Tue Oct 23 16:42:15 PDT 2018

https://bugs.llvm.org/show_bug.cgi?id=39414

            Bug ID: 39414
           Summary: Poor code generation for NEON unaligned loads/stores
           Product: libraries
           Version: 7.0
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: ARM
          Assignee: unassignedbugs at nondot.org
          Reporter: fabiang at radgametools.com
                CC: llvm-bugs at lists.llvm.org

Sample repro on Godbolt:

https://godbolt.org/z/itWjaD

Here's the (C++) code verbatim:

----

#include <arm_neon.h>

// uint32_t variant declared with aligned(1). The functions below cast to a
// pointer to this type to express a 32-bit access with no alignment promise.
typedef uint32_t U32unalign __attribute__((aligned(1)));

// Unaligned store case: reinterprets v as 32-bit lanes and stores lane 0 to p
// through a U32unalign pointer (declared with aligned(1)).
void f_unalign(void *p, uint8x8_t v)
{
    vst1_lane_u32((U32unalign *) p, vreinterpret_u32_u8(v), 0);
}

// Aligned store case: same store of 32-bit lane 0 of v, but p is cast to a
// plain uint32_t pointer, so the destination carries that type's alignment.
void f_align(void *p, uint8x8_t v)
{
    vst1_lane_u32((uint32_t *) p, vreinterpret_u32_u8(v), 0);
}

// Unaligned load case: loads one 32-bit value from p through a U32unalign
// pointer with vld1_dup_u32 and returns the result as uint8x8_t.
// NOTE(review): the result is returned without an explicit vreinterpret_u8_u32,
// presumably relying on an implicit vector conversion -- confirm.
// NOTE(review): the C-style cast also drops the const qualifier of p.
uint8x8_t g_unalign(const void *p)
{
    return vld1_dup_u32((U32unalign *)p);
}

// Aligned load case: same vld1_dup_u32 load, but p is cast to a plain
// uint32_t pointer, so the source carries that type's alignment.
// NOTE(review): the result is returned without an explicit vreinterpret_u8_u32,
// presumably relying on an implicit vector conversion -- confirm.
// NOTE(review): the C-style cast also drops the const qualifier of p.
uint8x8_t g_align(const void *p)
{
    return vld1_dup_u32((uint32_t *)p);
}

----

clang -target armv7a-none-eabi -O2 -S produces:

----

f_unalign(void*, __simd64_uint8_t):
        vmov    d16, r2, r3
        vmov.32 r1, d16[0]
        strb    r1, [r0]
        lsr     r2, r1, #24
        strb    r2, [r0, #3]
        lsr     r2, r1, #16
        lsr     r1, r1, #8
        strb    r2, [r0, #2]
        strb    r1, [r0, #1]
        bx      lr

f_align(void*, __simd64_uint8_t):
        vmov    d16, r2, r3
        vst1.32 {d16[0]}, [r0:32]
        bx      lr

g_unalign(void const*):
        ldrb    r1, [r0]
        ldrb    r2, [r0, #1]
        ldrb    r3, [r0, #2]
        ldrb    r0, [r0, #3]
        orr     r1, r1, r2, lsl #8
        orr     r0, r3, r0, lsl #8
        orr     r0, r1, r0, lsl #16
        vdup.32 d16, r0
        vmov    r0, r1, d16
        bx      lr

g_align(void const*):
        vld1.32 {d16[]}, [r0:32]
        vmov    r0, r1, d16
        bx      lr

----

The code I would expect to see for the unaligned variants is:

----

f_unalign(void*, __simd64_uint8_t):
        vmov    d16, r2, r3
        vst1.32 {d16[0]}, [r0] ; same as f_align except for :32 alignment specifier
        bx      lr

g_unalign(void const*):
        vld1.32 {d16[]}, [r0] ; same here
        vmov    r0, r1, d16
        bx      lr

----

This is especially awkward with the NEON intrinsics because the single-lane
variants of vst1.32 (or .16) can be used to either:

- Store a single 32-bit lane (the intrinsics are OK for this)
- Store contiguous 32 bits (or 16 bits) of a narrower type, which the
intrinsics can't really express, requiring awkward workarounds like the
"U32unalign" construction above.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20181023/ddd749cf/attachment.html>