[llvm-bugs] [Bug 41546] New: suboptimal codegen for llvm.x86.addcarryx.u32 and u64
via llvm-bugs
llvm-bugs at lists.llvm.org
Sat Apr 20 02:06:07 PDT 2019
https://bugs.llvm.org/show_bug.cgi?id=41546
Bug ID: 41546
Summary: suboptimal codegen for llvm.x86.addcarryx.u32 and u64
Product: libraries
Version: trunk
Hardware: PC
OS: All
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: gonzalobg88 at gmail.com
CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
llvm-dev at redking.me.uk, spatel+llvm at rotateright.com
Multi-precision (MP) multiplication is documented by Intel as one of the main use cases of the addcarryx
intrinsics
(https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf).
We implement this in Rust as:
use core::arch::x86_64::{_addcarryx_u64, _mulx_u64};
use core::hint::unreachable_unchecked;

#[target_feature(enable = "adx,bmi2")]
pub unsafe fn mp_mul_intrin(a: &[u64], b: &[u64], c: &mut [u64]) {
    let len = a.len();
    if len != b.len() {
        unreachable_unchecked();
    }
    if c.len() != len * 2 {
        unreachable_unchecked();
    }
    for b_i in (0..len).into_iter().rev() {
        let b_elem = *b.get_unchecked(b_i);
        let mut carry_lo = 0;
        let mut carry_hi = 0;
        for a_i in (0..len).into_iter().rev() {
            let mut hi = 0;
            let lo = _mulx_u64(*a.get_unchecked(a_i), b_elem, &mut hi);
            carry_lo = _addcarryx_u64(
                carry_lo,
                lo,
                *c.get_unchecked(b_i + a_i + 1),
                c.get_unchecked_mut(b_i + a_i + 1),
            );
            carry_hi = _addcarryx_u64(
                carry_hi,
                hi,
                *c.get_unchecked(b_i + a_i),
                c.get_unchecked_mut(b_i + a_i),
            );
        }
        *c.get_unchecked_mut(b_i) += carry_lo as u64;
    }
}
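A minimal caller sketch (the runtime feature check and the test values are illustrative, not part of the report): since the function is compiled with `target_feature(enable = "adx,bmi2")`, the caller must verify those features before calling it, and limbs are stored most-significant first, so the low word of the 2*len-limb product lands at the end of `c`.

fn main() {
    // The function is only sound to call on CPUs with ADX and BMI2.
    if is_x86_feature_detected!("adx") && is_x86_feature_detected!("bmi2") {
        let a = [3u64];
        let b = [5u64];
        let mut c = [0u64; 2]; // must start zeroed; partial products are accumulated into it
        unsafe { mp_mul_intrin(&a, &b, &mut c) };
        assert_eq!(c, [0, 15]); // 3 * 5 = 15, no high limb
    }
}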
Compiled with optimizations, `mp_mul_intrin` produces the following LLVM IR
(https://rust.godbolt.org/z/EJFEHB):
source_filename = "example.3a1fbbbh-cgu.0"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @_ZN7example13mp_mul_intrin17ha949a97694eafb34E([0 x i64]* noalias nocapture nonnull readonly align 8 %a.0, i64 %a.1, [0 x i64]* noalias nocapture nonnull readonly align 8 %b.0, i64 %b.1, [0 x i64]* nocapture nonnull align 8 %c.0, i64 %c.1) unnamed_addr #0 personality i32 (...)* @rust_eh_personality {
start:
  %0 = icmp eq i64 %a.1, 0
  br i1 %0, label %bb15, label %bb14

bb14:                                             ; preds = %start, %bb23
  %iter.sroa.4.040 = phi i64 [ %1, %bb23 ], [ %a.1, %start ]
  %1 = add i64 %iter.sroa.4.040, -1
  %2 = getelementptr inbounds [0 x i64], [0 x i64]* %b.0, i64 0, i64 %1
  %3 = load i64, i64* %2, align 8
  %4 = zext i64 %3 to i128
  br label %bb22

bb15:                                             ; preds = %bb23, %start
  ret void

bb22:                                             ; preds = %bb14, %bb22
  %carry_hi.039 = phi i8 [ 0, %bb14 ], [ %24, %bb22 ]
  %carry_lo.038 = phi i8 [ 0, %bb14 ], [ %19, %bb22 ]
  %iter2.sroa.4.037 = phi i64 [ %a.1, %bb14 ], [ %5, %bb22 ]
  %5 = add i64 %iter2.sroa.4.037, -1
  %6 = getelementptr inbounds [0 x i64], [0 x i64]* %a.0, i64 0, i64 %5
  %7 = load i64, i64* %6, align 8
  %8 = zext i64 %7 to i128
  %9 = mul nuw i128 %8, %4
  %10 = lshr i128 %9, 64
  %11 = trunc i128 %10 to i64
  %12 = trunc i128 %9 to i64
  %13 = add i64 %5, %1
  %14 = add i64 %iter2.sroa.4.037, %1
  %15 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %14
  %16 = load i64, i64* %15, align 8
  %17 = tail call { i8, i64 } @llvm.x86.addcarry.64(i8 %carry_lo.038, i64 %12, i64 %16) #3
  %18 = extractvalue { i8, i64 } %17, 1
  store i64 %18, i64* %15, align 8
  %19 = extractvalue { i8, i64 } %17, 0
  %20 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %13
  %21 = load i64, i64* %20, align 8
  %22 = tail call { i8, i64 } @llvm.x86.addcarry.64(i8 %carry_hi.039, i64 %11, i64 %21) #3
  %23 = extractvalue { i8, i64 } %22, 1
  store i64 %23, i64* %20, align 8
  %24 = extractvalue { i8, i64 } %22, 0
  %25 = icmp eq i64 %5, 0
  br i1 %25, label %bb23, label %bb22

bb23:                                             ; preds = %bb22
  %26 = zext i8 %19 to i64
  %27 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %1
  %28 = load i64, i64* %27, align 8
  %29 = add i64 %28, %26
  store i64 %29, i64* %27, align 8
  %30 = icmp eq i64 %1, 0
  br i1 %30, label %bb15, label %bb14
}

declare i32 @rust_eh_personality(...) unnamed_addr #1

declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64) #2

attributes #0 = { nounwind nonlazybind "probe-stack"="__rust_probestack" "target-cpu"="x86-64" "target-features"="+adx,+bmi2" }
attributes #1 = { nonlazybind "target-cpu"="x86-64" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }

!llvm.module.flags = !{!0}
!0 = !{i32 2, !"RtLibUseGOT", i32 1}
which gets compiled down to the following machine code:
example::mp_mul_intrin:
        pushq   %r15
        pushq   %r14
        pushq   %r12
        pushq   %rbx
        testq   %rsi, %rsi
        je      .LBB0_5
        movq    %rdx, %r9
        movq    %rsi, %r10
        negq    %r10
        movq    %rsi, %rax
        shlq    $4, %rax
        leaq    (%r8,%rax), %r12
        addq    $-8, %r12
        leaq    (%rdi,%rsi,8), %r11
        addq    $-8, %r11
.LBB0_2:
        leaq    -1(%rsi), %r14
        movq    -8(%r9,%rsi,8), %r15
        xorl    %ebx, %ebx
        xorl    %ecx, %ecx
        xorl    %edi, %edi
.LBB0_3:
        movq    %r15, %rax
        mulq    (%r11,%rbx,8)
        addb    $-1, %dil
        adcq    %rax, (%r12,%rbx,8)
        setb    %dil
        addb    $-1, %cl
        adcq    %rdx, -8(%r12,%rbx,8)
        setb    %cl
        addq    $-1, %rbx
        cmpq    %rbx, %r10
        jne     .LBB0_3
        movzbl  %dil, %eax
        addq    %rax, -8(%r8,%rsi,8)
        addq    $-8, %r12
        movq    %r14, %rsi
        testq   %r14, %r14
        jne     .LBB0_2
.LBB0_5:
        popq    %rbx
        popq    %r12
        popq    %r14
        popq    %r15
        retq
Implementing this operation using inline assembly that produces the expected
machine code:
// Uses the old (2019-era) nightly `asm!` macro, i.e. `#![feature(asm)]` at crate level.
#[inline(never)]
#[no_mangle]
pub fn mp_mul_asm(a: &[u64], b: &[u64]) -> Vec<u64> {
    unsafe {
        let len = a.len();
        assert_eq!(len, b.len());
        let mut c = vec![0; len * 2];
        asm!("
            # start iteration of `b_i`: `len`-1 downto 0
            lea -1($3), %rsi
        1:  # every iteration of `b_i`
            # load `b_elem`
            mov ($1, %rsi, 8), %rdx
            # clear `carry_lo`, `carry_hi`
            xor %r9, %r9
            # start iteration of `a_i`+1: `len` downto 1
            mov $3, %rcx
            jmp 3f
        2:  # the end of every iteration of `a_i`+1, except the last iteration
            # no displacement, RCX has already been decremented
            mov %r10, (%r11, %rcx, 8)
        3:  # every iteration of `a_i`+1
            # compute c + b_i
            lea ($2, %rsi, 8), %r11
            # compute a[a_i]*b_elem
            mulx -8($0, %rcx, 8), %r8, %r9
            # add low word
            mov (%r11, %rcx, 8), %r10
            adcx %r8, %r10
            mov %r10, (%r11, %rcx, 8)
            # add high word
            mov -8(%r11, %rcx, 8), %r10
            adox %r9, %r10
            # this is done later, to be able to add in the last carry in the last iteration of `a_i`+1
            # mov %r10, -8(%r11, %rcx, 8)
            # advance `a_i`+1
            lea -1(%rcx), %rcx
            jrcxz 4f
            jmp 2b
        4:  # the end of the last iteration of `a_i`+1
            adc $$0, %r10
            mov %r10, (%r11)
            # advance `b_i`
            dec %rsi
            jns 1b
            "
            :
            : "r"(a.as_ptr()), "r"(b.as_ptr()), "r"(c.as_mut_ptr()), "r"(len)
            : "rsi", "rcx", "rdx", "r8", "r9", "r10", "r11", "flags"
        );
        c
    }
}
and benchmarking both
(https://github.com/rust-lang-nursery/stdsimd/issues/666#issuecomment-485065551)
shows a significant performance difference: 390ns/iteration for the inline
assembly version vs 507ns/iteration for the one using `llvm.x86.addcarryx.u64`.
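The linked benchmark is the authoritative measurement; as a rough local comparison (the limb count, iteration count, and input values below are arbitrary placeholders, not those of the linked benchmark), a harness along these lines can be used:

use std::time::Instant;

fn bench(name: &str, mut f: impl FnMut()) {
    const ITERS: u32 = 100_000;
    let start = Instant::now();
    for _ in 0..ITERS {
        f();
    }
    println!("{}: {:?}/iteration", name, start.elapsed() / ITERS);
}

fn main() {
    // Both versions need ADX/BMI2 at runtime.
    assert!(is_x86_feature_detected!("adx") && is_x86_feature_detected!("bmi2"));

    // Arbitrary operand size and contents.
    let a: Vec<u64> = (1u64..=16).map(|i| i.wrapping_mul(0x9E37_79B9_7F4A_7C15)).collect();
    let b: Vec<u64> = (1u64..=16).map(|i| i ^ 0xDEAD_BEEF_CAFE_F00D).collect();

    let mut c = vec![0u64; a.len() * 2];
    bench("intrinsics", || {
        // The intrinsic version accumulates into `c`, so clear it each round.
        c.iter_mut().for_each(|x| *x = 0);
        unsafe { mp_mul_intrin(&a, &b, &mut c) };
    });

    let mut sink = 0u64;
    bench("inline asm", || {
        sink = sink.wrapping_add(mp_mul_asm(&a, &b)[0]);
    });
    println!("checksum: {}", sink); // use the results so they cannot be discarded
}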
It appears that LLVM always replaces `llvm.x86.addcarryx.u64` with a polyfill
based on `llvm.x86.addcarry.u64` and then fails to emit adcx/adox instructions,
so the two carry chains end up serialized through EFLAGS with the addb/setb
pairs visible in the listing above.
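A reduced sketch that isolates the dual-carry-chain pattern should hit the same lowering (the `two_chains` wrapper and the fixed four-limb width are illustrative, not code from the report); the expectation would be interleaved adcx/adox keeping one chain in CF and the other in OF, but the same adc-based flag juggling as above comes out instead:

use core::arch::x86_64::_addcarryx_u64;

// Two independent carry chains in one loop body; ADX exists precisely so that
// these can be kept live in CF and OF simultaneously via adcx/adox.
#[target_feature(enable = "adx")]
pub unsafe fn two_chains(lo: &mut [u64; 4], hi: &mut [u64; 4], add_lo: &[u64; 4], add_hi: &[u64; 4]) {
    let mut c_lo = 0u8;
    let mut c_hi = 0u8;
    for i in 0..4 {
        let mut out = 0u64;
        c_lo = _addcarryx_u64(c_lo, lo[i], add_lo[i], &mut out);
        lo[i] = out;
        c_hi = _addcarryx_u64(c_hi, hi[i], add_hi[i], &mut out);
        hi[i] = out;
    }
}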