[LLVMbugs] [Bug 15525] New: SROA pessimizes ARM int8x16x2_t function argument handling
bugzilla-daemon at llvm.org
Fri Mar 15 10:52:00 PDT 2013
http://llvm.org/bugs/show_bug.cgi?id=15525
            Bug ID: 15525
           Summary: SROA pessimizes ARM int8x16x2_t function argument
                    handling
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Scalar Optimizations
          Assignee: unassignedbugs at nondot.org
          Reporter: grosbach at apple.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified
$ cat foo.c
#include <arm_neon.h>

typedef uint8_t __attribute__((aligned(2))) uint8_a16;

void Store(uint8_a16 *dst, uint8x16x2_t vec) {
  vst1q_u8(dst, vec.val[0]);
}
With old SROA:
target triple = "thumbv7s-apple-ios7.0.0"
define void @Store(i8* %dst, [4 x i64] %vec.coerce) #0 {
entry:
  %0 = extractvalue [4 x i64] %vec.coerce, 0
  %1 = zext i64 %0 to i128
  %2 = extractvalue [4 x i64] %vec.coerce, 1
  %3 = zext i64 %2 to i128
  %4 = shl nuw i128 %3, 64
  %ins2 = or i128 %4, %1
  %5 = bitcast i128 %ins2 to <16 x i8>
  tail call void @llvm.arm.neon.vst1.v16i8(i8* %dst, <16 x i8> %5, i32 2)
  ret void
}
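In C terms, the old lowering just reassembles the two i64 halves into one
128-bit value and reinterprets it as the 16-byte vector. A rough scalar sketch
of the same idea (hypothetical helper, assuming a compiler with unsigned
__int128 and a little-endian target, as thumbv7s is):

#include <stdint.h>
#include <string.h>

/* Mirrors the zext/shl/or/bitcast sequence above: widen both
 * halves, shift the high half into place, OR them together, and
 * reinterpret the 128 bits as 16 bytes (memcpy stands in for the
 * bitcast; the byte order matches on a little-endian target). */
static void store16(uint8_t *dst, uint64_t lo, uint64_t hi) {
  unsigned __int128 v = ((unsigned __int128)hi << 64) | lo;
  memcpy(dst, &v, sizeof v);
}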
With new SROA:
target triple = "thumbv7s-apple-ios7.0.0"
define void @Store(i8* %dst, [4 x i64] %vec.coerce) #0 {
entry:
  %vec.coerce.fca.0.extract = extractvalue [4 x i64] %vec.coerce, 0
  %0 = bitcast i64 %vec.coerce.fca.0.extract to <8 x i8>
  %vec.sroa.0.0.vecinsert = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %vec.coerce.fca.1.extract = extractvalue [4 x i64] %vec.coerce, 1
  %1 = bitcast i64 %vec.coerce.fca.1.extract to <8 x i8>
  %vec.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vec.sroa.0.8.vecinsert = shufflevector <16 x i8> %vec.sroa.0.8.vec.expand, <16 x i8> %vec.sroa.0.0.vecinsert, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  tail call void @llvm.arm.neon.vst1.v16i8(i8* %dst, <16 x i8> %vec.sroa.0.8.vecinsert, i32 2)
  ret void
}
When we run each through codegen we get vastly different results. We want (and
the old SROA gives us):
_Store:
@ BB#0:                                 @ %entry
        vmov    d1, r3, r3
        vmov    d2, r2, r2
        vldr    s3, [sp]
        vmov    d0, r1, r1
        vmov.f32 s1, s4
        vst1.8  {d0, d1}, [r0]
        bx      lr
We now get:
_Store:
@ BB#0:                                 @ %entry
        push    {r4, r7, lr}
        add     r7, sp, #4
        sub     sp, #20
        mov     r4, sp
        bic     r4, r4, #15
        mov     sp, r4
        mov     r9, sp
        vmov    d16, r1, r2
        ldr     r2, [r7, #8]
        subs    r4, r7, #4
        orr     r1, r9, #7
        vst1.8  {d16[7]}, [r1]
        orr     r1, r9, #6
        vst1.8  {d16[6]}, [r1]
        orr     r1, r9, #5
        vst1.8  {d16[5]}, [r1]
        orr     r1, r9, #4
        vst1.8  {d16[4]}, [r1]
        orr     r1, r9, #3
        vst1.8  {d16[3]}, [r1]
        orr     r1, r9, #2
        vst1.8  {d16[2]}, [r1]
        orr     r1, r9, #1
        vst1.8  {d16[1]}, [r1]
        vst1.8  {d16[0]}, [r9]
        orr     r1, r9, #15
        vmov    d16, r3, r2
        vst1.8  {d16[7]}, [r1]
        orr     r1, r9, #14
        vst1.8  {d16[6]}, [r1]
        orr     r1, r9, #13
        vst1.8  {d16[5]}, [r1]
        orr     r1, r9, #12
        vst1.8  {d16[4]}, [r1]
        orr     r1, r9, #11
        vst1.8  {d16[3]}, [r1]
        orr     r1, r9, #10
        vst1.8  {d16[2]}, [r1]
        orr     r1, r9, #9
        vst1.8  {d16[1]}, [r1]
        orr     r1, r9, #8
        vst1.8  {d16[0]}, [r1]
        vld1.64 {d16, d17}, [r9:128]
        vst1.8  {d16, d17}, [r0]
        mov     sp, r4
        pop     {r4, r7, pc}
Now, it's not unreasonable to say that the ARM backend can and should do better
with the new IR; however, the IR isn't exactly making it easy. Consider the
final merge of the sub-vectors:
  %vec.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vec.sroa.0.8.vecinsert = shufflevector <16 x i8> %vec.sroa.0.8.vec.expand, <16 x i8> %vec.sroa.0.0.vecinsert, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
The element ordering here is unusual, to say the least: the net effect of the
two shuffles is simply to concatenate %0 and %1, but that fact is buried in the
odd masks, and it's not all that surprising the backend punts to a generic
expansion.
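For contrast, a single two-operand shufflevector expresses the same
concatenation directly, and that is the form backends typically recognize as a
vector concat; a sketch of the friendlier IR (the %cat name is made up here):

  %cat = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  tail call void @llvm.arm.neon.vst1.v16i8(i8* %dst, <16 x i8> %cat, i32 2)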