[PATCH] D56118: [ARM]: Add optimized NEON uint64x2_t multiply routine.
easyaspi314 (Devin) via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 2 18:01:36 PST 2019
easyaspi314 added a comment.
This is what I have now. Constant swapping and twomul is now implemented, and I will update the patch once I finish the documentation and tests, as well as double-check the return values on my phone.
typedef unsigned long long v2i64 __attribute__((vector_size(16)));

/* Full 64x64 -> 64 multiply in each lane. */
v2i64 mult(v2i64 v1, v2i64 v2) {
    return v1 * v2;
}

/* Both operands masked to their low 32 bits. */
v2i64 mult_lo(v2i64 v1, v2i64 v2) {
    return (v1 & 0xFFFFFFFF) * (v2 & 0xFFFFFFFF);
}

/* Only one operand masked to its low 32 bits. */
v2i64 mult_lo_lohi(v2i64 v1, v2i64 v2) {
    return (v1 & 0xFFFFFFFF) * v2;
}

/* Multiply by a full 64-bit constant. */
v2i64 mult_constant(v2i64 v1) {
    return 1234567889904ULL * v1;
}

/* Multiply by a constant that fits in 32 bits. */
v2i64 mult_lo_constant(v2i64 v1) {
    return v1 * 1234ULL;
}
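
For reference, here is a plain scalar sketch of the per-lane twomul decomposition the generated code below relies on (my own illustration with a made-up name, `twomul_lane`; it is not part of the patch):

#include <stdint.h>

/* Per-lane view of the twomul trick: a*b mod 2^64 needs only three 32x32
 * products, because the hi(a)*hi(b) term would be shifted out entirely.
 * In the NEON sequence for mult below, vrev64.32 + vmul.i32 + vpaddl.u32
 * form the two cross products and sum them, vshl.i64 #32 moves that sum
 * into the high half, and vmlal.u32 accumulates the widening lo(a)*lo(b)
 * product. */
static uint64_t twomul_lane(uint64_t a, uint64_t b) {
    uint32_t a_lo = (uint32_t)a, a_hi = (uint32_t)(a >> 32);
    uint32_t b_lo = (uint32_t)b, b_hi = (uint32_t)(b >> 32);
    uint64_t cross = ((uint64_t)a_lo * b_hi + (uint64_t)a_hi * b_lo) << 32;
    return cross + (uint64_t)a_lo * b_lo;
}

mult_lo and mult_lo_lohi are the special cases where one or both cross products are known to be zero, so that stage shrinks to a single vmull or drops out entirely.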
mult:
        vmov d17, r2, r3
        mov r12, sp
        vmov d16, r0, r1
        vld1.64 {d18, d19}, [r12]
        vrev64.32 q10, q8
        vmovn.i64 d16, q8
        vmovn.i64 d17, q9
        vmul.i32 q10, q10, q9
        vpaddl.u32 q10, q10
        vshl.i64 q9, q10, #32
        vmlal.u32 q9, d17, d16
        vmov r0, r1, d18
        vmov r2, r3, d19
        bx lr
mult_lo:
        vmov d19, r2, r3
        vmov.i64 q8, #0xffffffff
        vmov d18, r0, r1
        mov r0, sp
        vand q9, q9, q8
        vld1.64 {d20, d21}, [r0]
        vand q8, q10, q8
        vmovn.i64 d18, q9
        vmovn.i64 d16, q8
        vmull.u32 q8, d16, d18
        vmov r0, r1, d16
        vmov r2, r3, d17
        bx lr
mult_lo_lohi:
        vmov d19, r2, r3
        vmov.i64 q8, #0xffffffff
        vmov d18, r0, r1
        mov r0, sp
        vld1.64 {d20, d21}, [r0]
        vand q8, q9, q8
        vshrn.i64 d18, q10, #32
        vmovn.i64 d16, q8
        vmovn.i64 d17, q10
        vmull.u32 q9, d16, d18
        vshl.i64 q9, q9, #32
        vmlal.u32 q9, d16, d17
        vmov r0, r1, d18
        vmov r2, r3, d19
        bx lr
mult_constant:
        adr r12, .LCPI3_0
        vmov d17, r2, r3
        vld1.64 {d18, d19}, [r12:128]
        vmov d16, r0, r1
        vmul.i32 q9, q8, q9
        vldr d20, .LCPI3_1
        vmovn.i64 d16, q8
        vpaddl.u32 q9, q9
        vshl.i64 q9, q9, #32
        vmlal.u32 q9, d16, d20
        vmov r0, r1, d18
        vmov r2, r3, d19
        bx lr
.LCPI3_0:
        .long 1912275952
        .long 1912275952
        .long 287
        .long 287
.LCPI3_1:
        .long 1912275952
        .long 1912275952
mult_lo_constant:
        vmov d17, r2, r3
        vldr d18, .LCPI4_0
        vmov d16, r0, r1
        vshrn.i64 d19, q8, #32
        vmovn.i64 d16, q8
        vmull.u32 q10, d19, d18
        vshl.i64 q10, q10, #32
        vmlal.u32 q10, d16, d18
        vmov r0, r1, d20
        vmov r2, r3, d21
        bx lr
.LCPI4_0:
        .long 1234
        .long 1234
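
Regarding double-checking the return values: something along these lines is what I have in mind (an illustrative harness only, assuming it is compiled in the same file as the functions above; it is not the actual test going into the patch):

#include <stdio.h>
#include <stdint.h>

/* Compares each lane of mult() against a plain scalar 64-bit multiply. */
int main(void) {
    v2i64 a = { 0x123456789ABCDEF0ULL, 0xFFFFFFFFFFFFFFFFULL };
    v2i64 b = { 0x0FEDCBA987654321ULL, 0x00000000DEADBEEFULL };
    v2i64 r = mult(a, b);
    int fails = 0;
    for (int i = 0; i < 2; i++) {
        uint64_t expect = a[i] * b[i];  /* scalar reference multiply */
        printf("lane %d: got %016llx, expect %016llx\n", i,
               (unsigned long long)r[i], (unsigned long long)expect);
        fails += (r[i] != expect);
    }
    return fails;
}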
If someone can optimize away that redundant constant load in `mult_constant` (the `vldr d20, .LCPI3_1`, which just reloads the low words already fetched from `.LCPI3_0`), I would appreciate it.
This is the code I want:
mult_constant:
        adr r12, .LCPI3_0
        vmov d17, r2, r3
        vld1.64 {d22, d23}, [r12:128]
        vmov d16, r0, r1
        vmul.i32 q9, q8, q11
        vmovn.i64 d16, q8
        vpaddl.u32 q9, q9
        vshl.i64 q9, q9, #32
        vmlal.u32 q9, d16, d22
        vmov r0, r1, d18
        vmov r2, r3, d19
        bx lr
.LCPI3_0:
        .long 1912275952
        .long 1912275952
        .long 287
        .long 287
Repository:
rL LLVM
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D56118/new/
https://reviews.llvm.org/D56118