[llvm-bugs] [Bug 41546] New: suboptimal codegen for llvm.x86.addcarryx.u32 and u64
via llvm-bugs
llvm-bugs at lists.llvm.org
Sat Apr 20 02:06:07 PDT 2019
https://bugs.llvm.org/show_bug.cgi?id=41546
Bug ID: 41546
Summary: suboptimal codegen for llvm.x86.addcarryx.u32 and u64
Product: libraries
Version: trunk
Hardware: PC
OS: All
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: gonzalobg88 at gmail.com
CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
llvm-dev at redking.me.uk, spatel+llvm at rotateright.com
Multi-precision (MP) multiplication is documented by Intel as one of the main use cases of the addcarryx
intrinsics
(https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf).
We implement this in Rust as:
use core::arch::x86_64::{_addcarryx_u64, _mulx_u64};
use core::hint::unreachable_unchecked;

#[target_feature(enable = "adx,bmi2")]
pub unsafe fn mp_mul_intrin(a: &[u64], b: &[u64], c: &mut [u64]) {
    let len = a.len();
    if len != b.len() {
        unreachable_unchecked();
    }
    if c.len() != len * 2 {
        unreachable_unchecked();
    }
    for b_i in (0..len).into_iter().rev() {
        let b_elem = *b.get_unchecked(b_i);
        let mut carry_lo = 0;
        let mut carry_hi = 0;
        for a_i in (0..len).into_iter().rev() {
            let mut hi = 0;
            let lo = _mulx_u64(*a.get_unchecked(a_i), b_elem, &mut hi);
            carry_lo = _addcarryx_u64(
                carry_lo,
                lo,
                *c.get_unchecked(b_i + a_i + 1),
                c.get_unchecked_mut(b_i + a_i + 1),
            );
            carry_hi = _addcarryx_u64(
                carry_hi,
                hi,
                *c.get_unchecked(b_i + a_i),
                c.get_unchecked_mut(b_i + a_i),
            );
        }
        *c.get_unchecked_mut(b_i) += carry_lo as u64;
    }
}
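A minimal caller sketch (the runtime feature check and the test values are illustrative, not part of the report): since the function is compiled with `target_feature(enable = "adx,bmi2")`, the caller must verify those features before calling it, and limbs are stored most-significant first, so the low word of the 2*len-limb product lands at the end of `c`.

fn main() {
    // The function is only sound to call on CPUs with ADX and BMI2.
    if is_x86_feature_detected!("adx") && is_x86_feature_detected!("bmi2") {
        let a = [3u64];
        let b = [5u64];
        let mut c = [0u64; 2]; // must start zeroed; partial products are accumulated into it
        unsafe { mp_mul_intrin(&a, &b, &mut c) };
        assert_eq!(c, [0, 15]); // 3 * 5 = 15, no high limb
    }
}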
Compiled with optimizations, `mp_mul_intrin` produces the following LLVM IR
(https://rust.godbolt.org/z/EJFEHB):
source_filename = "example.3a1fbbbh-cgu.0"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @_ZN7example13mp_mul_intrin17ha949a97694eafb34E([0 x i64]* noalias nocapture nonnull readonly align 8 %a.0, i64 %a.1, [0 x i64]* noalias nocapture nonnull readonly align 8 %b.0, i64 %b.1, [0 x i64]* nocapture nonnull align 8 %c.0, i64 %c.1) unnamed_addr #0 personality i32 (...)* @rust_eh_personality {
start:
  %0 = icmp eq i64 %a.1, 0
  br i1 %0, label %bb15, label %bb14

bb14:                                             ; preds = %start, %bb23
  %iter.sroa.4.040 = phi i64 [ %1, %bb23 ], [ %a.1, %start ]
  %1 = add i64 %iter.sroa.4.040, -1
  %2 = getelementptr inbounds [0 x i64], [0 x i64]* %b.0, i64 0, i64 %1
  %3 = load i64, i64* %2, align 8
  %4 = zext i64 %3 to i128
  br label %bb22

bb15:                                             ; preds = %bb23, %start
  ret void

bb22:                                             ; preds = %bb14, %bb22
  %carry_hi.039 = phi i8 [ 0, %bb14 ], [ %24, %bb22 ]
  %carry_lo.038 = phi i8 [ 0, %bb14 ], [ %19, %bb22 ]
  %iter2.sroa.4.037 = phi i64 [ %a.1, %bb14 ], [ %5, %bb22 ]
  %5 = add i64 %iter2.sroa.4.037, -1
  %6 = getelementptr inbounds [0 x i64], [0 x i64]* %a.0, i64 0, i64 %5
  %7 = load i64, i64* %6, align 8
  %8 = zext i64 %7 to i128
  %9 = mul nuw i128 %8, %4
  %10 = lshr i128 %9, 64
  %11 = trunc i128 %10 to i64
  %12 = trunc i128 %9 to i64
  %13 = add i64 %5, %1
  %14 = add i64 %iter2.sroa.4.037, %1
  %15 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %14
  %16 = load i64, i64* %15, align 8
  %17 = tail call { i8, i64 } @llvm.x86.addcarry.64(i8 %carry_lo.038, i64 %12, i64 %16) #3
  %18 = extractvalue { i8, i64 } %17, 1
  store i64 %18, i64* %15, align 8
  %19 = extractvalue { i8, i64 } %17, 0
  %20 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %13
  %21 = load i64, i64* %20, align 8
  %22 = tail call { i8, i64 } @llvm.x86.addcarry.64(i8 %carry_hi.039, i64 %11, i64 %21) #3
  %23 = extractvalue { i8, i64 } %22, 1
  store i64 %23, i64* %20, align 8
  %24 = extractvalue { i8, i64 } %22, 0
  %25 = icmp eq i64 %5, 0
  br i1 %25, label %bb23, label %bb22

bb23:                                             ; preds = %bb22
  %26 = zext i8 %19 to i64
  %27 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %1
  %28 = load i64, i64* %27, align 8
  %29 = add i64 %28, %26
  store i64 %29, i64* %27, align 8
  %30 = icmp eq i64 %1, 0
  br i1 %30, label %bb15, label %bb14
}

declare i32 @rust_eh_personality(...) unnamed_addr #1

declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64) #2

attributes #0 = { nounwind nonlazybind "probe-stack"="__rust_probestack" "target-cpu"="x86-64" "target-features"="+adx,+bmi2" }
attributes #1 = { nonlazybind "target-cpu"="x86-64" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }

!llvm.module.flags = !{!0}
!0 = !{i32 2, !"RtLibUseGOT", i32 1}
which gets compiled down to the following machine code:
example::mp_mul_intrin:
        pushq   %r15
        pushq   %r14
        pushq   %r12
        pushq   %rbx
        testq   %rsi, %rsi
        je      .LBB0_5
        movq    %rdx, %r9
        movq    %rsi, %r10
        negq    %r10
        movq    %rsi, %rax
        shlq    $4, %rax
        leaq    (%r8,%rax), %r12
        addq    $-8, %r12
        leaq    (%rdi,%rsi,8), %r11
        addq    $-8, %r11
.LBB0_2:
        leaq    -1(%rsi), %r14
        movq    -8(%r9,%rsi,8), %r15
        xorl    %ebx, %ebx
        xorl    %ecx, %ecx
        xorl    %edi, %edi
.LBB0_3:
        movq    %r15, %rax
        mulq    (%r11,%rbx,8)
        addb    $-1, %dil
        adcq    %rax, (%r12,%rbx,8)
        setb    %dil
        addb    $-1, %cl
        adcq    %rdx, -8(%r12,%rbx,8)
        setb    %cl
        addq    $-1, %rbx
        cmpq    %rbx, %r10
        jne     .LBB0_3
        movzbl  %dil, %eax
        addq    %rax, -8(%r8,%rsi,8)
        addq    $-8, %r12
        movq    %r14, %rsi
        testq   %r14, %r14
        jne     .LBB0_2
.LBB0_5:
        popq    %rbx
        popq    %r12
        popq    %r14
        popq    %r15
        retq
Implementing this operation using inline assembly that produces the expected
machine code:
// Uses the old (2019-era) nightly `asm!` macro, i.e. `#![feature(asm)]` at crate level.
#[inline(never)]
#[no_mangle]
pub fn mp_mul_asm(a: &[u64], b: &[u64]) -> Vec<u64> {
    unsafe {
        let len = a.len();
        assert_eq!(len, b.len());
        let mut c = vec![0; len * 2];
        asm!("
            # start iteration of `b_i`: `len`-1 downto 0
            lea -1($3), %rsi
        1:  # every iteration of `b_i`
            # load `b_elem`
            mov ($1, %rsi, 8), %rdx
            # clear `carry_lo`, `carry_hi`
            xor %r9, %r9
            # start iteration of `a_i`+1: `len` downto 1
            mov $3, %rcx
            jmp 3f
        2:  # the end of every iteration of `a_i`+1, except the last iteration
            # no displacement, RCX has already been decremented
            mov %r10, (%r11, %rcx, 8)
        3:  # every iteration of `a_i`+1
            # compute c + b_i
            lea ($2, %rsi, 8), %r11
            # compute a[a_i]*b_elem
            mulx -8($0, %rcx, 8), %r8, %r9
            # add low word
            mov (%r11, %rcx, 8), %r10
            adcx %r8, %r10
            mov %r10, (%r11, %rcx, 8)
            # add high word
            mov -8(%r11, %rcx, 8), %r10
            adox %r9, %r10
            # this is done later, to be able to add in the last carry in the last iteration of `a_i`+1
            # mov %r10, -8(%r11, %rcx, 8)
            # advance `a_i`+1
            lea -1(%rcx), %rcx
            jrcxz 4f
            jmp 2b
        4:  # the end of the last iteration of `a_i`+1
            adc $$0, %r10
            mov %r10, (%r11)
            # advance `b_i`
            dec %rsi
            jns 1b
            "
            :
            : "r"(a.as_ptr()), "r"(b.as_ptr()), "r"(c.as_mut_ptr()), "r"(len)
            : "rsi", "rcx", "rdx", "r8", "r9", "r10", "r11", "flags"
        );
        c
    }
}
and benchmarking both
(https://github.com/rust-lang-nursery/stdsimd/issues/666#issuecomment-485065551)
shows a significant performance difference: 390ns/iteration for the inline
assembly version vs 507ns/iteration for the one using `llvm.x86.addcarryx.u64`.
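The linked benchmark is the authoritative measurement; as a rough local comparison (the limb count, iteration count, and input values below are arbitrary placeholders, not those of the linked benchmark), a harness along these lines can be used:

use std::time::Instant;

fn bench(name: &str, mut f: impl FnMut()) {
    const ITERS: u32 = 100_000;
    let start = Instant::now();
    for _ in 0..ITERS {
        f();
    }
    println!("{}: {:?}/iteration", name, start.elapsed() / ITERS);
}

fn main() {
    // Both versions need ADX/BMI2 at runtime.
    assert!(is_x86_feature_detected!("adx") && is_x86_feature_detected!("bmi2"));

    // Arbitrary operand size and contents.
    let a: Vec<u64> = (1u64..=16).map(|i| i.wrapping_mul(0x9E37_79B9_7F4A_7C15)).collect();
    let b: Vec<u64> = (1u64..=16).map(|i| i ^ 0xDEAD_BEEF_CAFE_F00D).collect();

    let mut c = vec![0u64; a.len() * 2];
    bench("intrinsics", || {
        // The intrinsic version accumulates into `c`, so clear it each round.
        c.iter_mut().for_each(|x| *x = 0);
        unsafe { mp_mul_intrin(&a, &b, &mut c) };
    });

    let mut sink = 0u64;
    bench("inline asm", || {
        sink = sink.wrapping_add(mp_mul_asm(&a, &b)[0]);
    });
    println!("checksum: {}", sink); // use the results so they cannot be discarded
}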
It appears that LLVM always replaces `llvm.x86.addcarryx.u64` with a polyfill
based on `llvm.x86.addcarry.u64` and then fails to emit adcx/adox instructions,
so the two carry chains end up serialized through EFLAGS with the addb/setb
pairs visible in the listing above.
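A reduced sketch that isolates the dual-carry-chain pattern should hit the same lowering (the `two_chains` wrapper and the fixed four-limb width are illustrative, not code from the report); the expectation would be interleaved adcx/adox keeping one chain in CF and the other in OF, but the same adc-based flag juggling as above comes out instead:

use core::arch::x86_64::_addcarryx_u64;

// Two independent carry chains in one loop body; ADX exists precisely so that
// these can be kept live in CF and OF simultaneously via adcx/adox.
#[target_feature(enable = "adx")]
pub unsafe fn two_chains(lo: &mut [u64; 4], hi: &mut [u64; 4], add_lo: &[u64; 4], add_hi: &[u64; 4]) {
    let mut c_lo = 0u8;
    let mut c_hi = 0u8;
    for i in 0..4 {
        let mut out = 0u64;
        c_lo = _addcarryx_u64(c_lo, lo[i], add_lo[i], &mut out);
        lo[i] = out;
        c_hi = _addcarryx_u64(c_hi, hi[i], add_hi[i], &mut out);
        hi[i] = out;
    }
}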