[LLVMbugs] [Bug 20748] New: @llvm.uadd.with.overflow.i32 (a.k.a. __builtin_addc) intrinsic produces worse code than non-intrinsic version

bugzilla-daemon at llvm.org bugzilla-daemon at llvm.org
Mon Aug 25 15:04:04 PDT 2014


http://llvm.org/bugs/show_bug.cgi?id=20748

            Bug ID: 20748
           Summary: @llvm.uadd.with.overflow.i32 (a.k.a. __builtin_addc)
                    intrinsic produces worse code than non-intrinsic
                    version
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: Common Code Generator Code
          Assignee: unassignedbugs at nondot.org
          Reporter: oneill+llvmbugs at cs.hmc.edu
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

Created attachment 12934
  --> http://llvm.org/bugs/attachment.cgi?id=12934&action=edit
Intrinsic vs. nonintrinsic add-with-carry

LLVM and Clang claim to provide intrinsics that efficiently support
multiprecision arithmetic, described here

http://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins 

but the actual code produced is poor, and is actually *worse* than the code
LLVM produces if we hand code an equivalent function to the intrinsic.

For example, consider the attached code, which is based on code at the above
URL.  The version using the LLVM intrinsic produces:

_addc4:                                 ## @addc4
    .cfi_startproc
## BB#0:                                ## %entry
    pushq    %rbp
Ltmp3:
    .cfi_def_cfa_offset 16
Ltmp4:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp5:
    .cfi_def_cfa_register %rbp
    movl    (%rdi), %eax
    addl    (%rsi), %eax
    sbbl    %ecx, %ecx
    andl    $1, %ecx
    movl    %eax, (%rdx)
    movl    4(%rdi), %eax
    addl    4(%rsi), %eax
    sbbb    %r8b, %r8b
    addl    %ecx, %eax
    sbbb    %cl, %cl
    orb    %r8b, %cl
    andb    $1, %cl
    movzbl    %cl, %r8d
    movl    %eax, 4(%rdx)
    movl    8(%rdi), %eax
    addl    8(%rsi), %eax
    sbbb    %r9b, %r9b
    addl    %r8d, %eax
    sbbb    %cl, %cl
    orb    %r9b, %cl
    andb    $1, %cl
    movzbl    %cl, %ecx
    movl    %eax, 8(%rdx)
    movl    12(%rsi), %eax
    addl    12(%rdi), %eax
    addl    %ecx, %eax
    movl    %eax, 12(%rdx)
    popq    %rbp
    retq
    .cfi_endproc

with not an adc instruction in sight! (It *could* have been compiled down to
an add and three adc instructions.)

In contrast, if we compile with -DOVERRIDE_INTRINSIC, we get

_addc4:                                 ## @addc4
    .cfi_startproc
## BB#0:                                ## %entry
    pushq    %rbp
Ltmp3:
    .cfi_def_cfa_offset 16
Ltmp4:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp5:
    .cfi_def_cfa_register %rbp
    movl    (%rsi), %r8d
    movl    (%rdi), %ecx
    leal    (%r8,%rcx), %eax
    movl    %eax, (%rdx)
    movl    4(%rdi), %r9d
    movl    4(%rsi), %eax
    addl    %r9d, %eax
    addl    %r8d, %ecx
    adcl    $0, %eax
    movl    %eax, 4(%rdx)
    movl    8(%rdi), %r8d
    movl    8(%rsi), %ecx
    addl    %r8d, %ecx
    cmpl    %r9d, %eax
    adcl    $0, %ecx
    movl    %ecx, 8(%rdx)
    movl    12(%rsi), %eax
    addl    12(%rdi), %eax
    cmpl    %r8d, %ecx
    adcl    $0, %eax
    movl    %eax, 12(%rdx)
    popq    %rbp
    retq
    .cfi_endproc

which is still fairly poor code — the adcl $0 could be folded into the
preceding add — but it is still far better than the intrinsic version.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20140825/8f5d8f93/attachment.html>


More information about the llvm-bugs mailing list