[LLVMbugs] [Bug 15525] New: SROA pessimizes ARM int8x16x2_t function argument handling
bugzilla-daemon at llvm.org
Fri Mar 15 10:52:00 PDT 2013
http://llvm.org/bugs/show_bug.cgi?id=15525
            Bug ID: 15525
           Summary: SROA pessimizes ARM int8x16x2_t function argument
                    handling
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Scalar Optimizations
          Assignee: unassignedbugs at nondot.org
          Reporter: grosbach at apple.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified
$ cat foo.c
#include <arm_neon.h>

typedef uint8_t __attribute__((aligned(2))) uint8_a16;

void Store(uint8_a16 *dst, uint8x16x2_t vec) {
  vst1q_u8(dst, vec.val[0]);
}
With old SROA:
target triple = "thumbv7s-apple-ios7.0.0"
define void @Store(i8* %dst, [4 x i64] %vec.coerce) #0 {
entry:
  %0 = extractvalue [4 x i64] %vec.coerce, 0
  %1 = zext i64 %0 to i128
  %2 = extractvalue [4 x i64] %vec.coerce, 1
  %3 = zext i64 %2 to i128
  %4 = shl nuw i128 %3, 64
  %ins2 = or i128 %4, %1
  %5 = bitcast i128 %ins2 to <16 x i8>
  tail call void @llvm.arm.neon.vst1.v16i8(i8* %dst, <16 x i8> %5, i32 2)
  ret void
}
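In C terms, the old lowering just reassembles the two i64 halves into one
128-bit value and reinterprets it as the 16-byte vector. A rough scalar sketch
of the same idea (hypothetical helper, assuming a compiler with unsigned
__int128 and a little-endian target, as thumbv7s is):

#include <stdint.h>
#include <string.h>

/* Mirrors the zext/shl/or/bitcast sequence above: widen both
 * halves, shift the high half into place, OR them together, and
 * reinterpret the 128 bits as 16 bytes (memcpy stands in for the
 * bitcast; the byte order matches on a little-endian target). */
static void store16(uint8_t *dst, uint64_t lo, uint64_t hi) {
  unsigned __int128 v = ((unsigned __int128)hi << 64) | lo;
  memcpy(dst, &v, sizeof v);
}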
With new SROA:
target triple = "thumbv7s-apple-ios7.0.0"
define void @Store(i8* %dst, [4 x i64] %vec.coerce) #0 {
entry:
  %vec.coerce.fca.0.extract = extractvalue [4 x i64] %vec.coerce, 0
  %0 = bitcast i64 %vec.coerce.fca.0.extract to <8 x i8>
  %vec.sroa.0.0.vecinsert = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %vec.coerce.fca.1.extract = extractvalue [4 x i64] %vec.coerce, 1
  %1 = bitcast i64 %vec.coerce.fca.1.extract to <8 x i8>
  %vec.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vec.sroa.0.8.vecinsert = shufflevector <16 x i8> %vec.sroa.0.8.vec.expand, <16 x i8> %vec.sroa.0.0.vecinsert, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  tail call void @llvm.arm.neon.vst1.v16i8(i8* %dst, <16 x i8> %vec.sroa.0.8.vecinsert, i32 2)
  ret void
}
When we run each through codegen we get vastly different results. We want (and
the old SROA gives us):
_Store:
@ BB#0:                                 @ %entry
        vmov    d1, r3, r3
        vmov    d2, r2, r2
        vldr    s3, [sp]
        vmov    d0, r1, r1
        vmov.f32 s1, s4
        vst1.8  {d0, d1}, [r0]
        bx      lr
We now get:
_Store:
@ BB#0:                                 @ %entry
        push    {r4, r7, lr}
        add     r7, sp, #4
        sub     sp, #20
        mov     r4, sp
        bic     r4, r4, #15
        mov     sp, r4
        mov     r9, sp
        vmov    d16, r1, r2
        ldr     r2, [r7, #8]
        subs    r4, r7, #4
        orr     r1, r9, #7
        vst1.8  {d16[7]}, [r1]
        orr     r1, r9, #6
        vst1.8  {d16[6]}, [r1]
        orr     r1, r9, #5
        vst1.8  {d16[5]}, [r1]
        orr     r1, r9, #4
        vst1.8  {d16[4]}, [r1]
        orr     r1, r9, #3
        vst1.8  {d16[3]}, [r1]
        orr     r1, r9, #2
        vst1.8  {d16[2]}, [r1]
        orr     r1, r9, #1
        vst1.8  {d16[1]}, [r1]
        vst1.8  {d16[0]}, [r9]
        orr     r1, r9, #15
        vmov    d16, r3, r2
        vst1.8  {d16[7]}, [r1]
        orr     r1, r9, #14
        vst1.8  {d16[6]}, [r1]
        orr     r1, r9, #13
        vst1.8  {d16[5]}, [r1]
        orr     r1, r9, #12
        vst1.8  {d16[4]}, [r1]
        orr     r1, r9, #11
        vst1.8  {d16[3]}, [r1]
        orr     r1, r9, #10
        vst1.8  {d16[2]}, [r1]
        orr     r1, r9, #9
        vst1.8  {d16[1]}, [r1]
        orr     r1, r9, #8
        vst1.8  {d16[0]}, [r1]
        vld1.64 {d16, d17}, [r9:128]
        vst1.8  {d16, d17}, [r0]
        mov     sp, r4
        pop     {r4, r7, pc}
Now, it's not unreasonable to say that the ARM backend can and should do better
with the new IR; however, the IR isn't exactly making it easy. Consider the
final merge of the sub-vectors:
  %vec.sroa.0.8.vec.expand = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vec.sroa.0.8.vecinsert = shufflevector <16 x i8> %vec.sroa.0.8.vec.expand, <16 x i8> %vec.sroa.0.0.vecinsert, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
The element ordering here is unusual, to say the least: the net effect of the
two shuffles is simply to concatenate %0 and %1, but that fact is buried in the
odd masks, and it's not all that surprising the backend punts to a generic
expansion.
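For contrast, a single two-operand shufflevector expresses the same
concatenation directly, and that is the form backends typically recognize as a
vector concat; a sketch of the friendlier IR (the %cat name is made up here):

  %cat = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  tail call void @llvm.arm.neon.vst1.v16i8(i8* %dst, <16 x i8> %cat, i32 2)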