[LLVMbugs] [Bug 16725] New: Inefficient code generated for i64 * i64 -> i128 multiplication in loop

bugzilla-daemon at llvm.org bugzilla-daemon at llvm.org
Sun Jul 28 09:45:31 PDT 2013


http://llvm.org/bugs/show_bug.cgi?id=16725

            Bug ID: 16725
           Summary: Inefficient code generated for i64 * i64 -> i128
                    multiplication in loop
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Common Code Generator Code
          Assignee: unassignedbugs at nondot.org
          Reporter: jacob.manuel at yahoo.de
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

Created attachment 10942
  --> http://llvm.org/bugs/attachment.cgi?id=10942&action=edit
clang mul1.c -S -O3 -std=c99 -o mul1_clang.s

The code generator currently doesn't lower a 128-bit multiplication of two
sign-extended 64-bit integers to a single SMUL_LOHI node if one of the
sign-extended operands is computed outside the loop and carried into it.


Example code:

#define SHIFT 63
#define MASK ((1LU << SHIFT) - 1)
void mul1(int64_t *arr, long arrsize, int64_t factor) {
  __int128 carry = 0;
  for (long i = 0; i < arrsize; ++i) {
    __int128 e = arr[i];
    carry += e * factor;
    arr[i] = carry & MASK;
    carry >>= SHIFT;
  }
}


Clang generates this code for the inner loop:

movq    (%rdi), %rcx
movq    %rcx, %rbx
imulq    %r9, %rbx
movq    %rcx, %rax
mulq    %r8
addq    %rbx, %rdx
sarq    $63, %rcx
imulq    %r8, %rcx
addq    %rdx, %rcx
addq    %r10, %rax
adcq    %r14, %rcx
movq    %rcx, %r10
shldq    $1, %rax, %r10
andq    %r11, %rax
movq    %rax, (%rdi)
sarq    $63, %rcx
addq    $8, %rdi
decq    %rsi
movq    %rcx, %r14
jne    .LBB0_2


While gcc generates this code:

movq    %r8, %rax
imulq    (%rcx)
addq    %rsi, %rax
movq    %rax, %rsi
adcq    %rdi, %rdx
addq    $8, %rcx
andq    %r9, %rsi
movq    %rsi, -8(%rcx)
shrdq    $63, %rdx, %rax
sarq    $63, %rdx
cmpq    %r10, %rcx
movq    %rax, %rsi
movq    %rdx, %rdi
jne    .L4


Clang generates three multiplication instructions (two imulq and one mulq),
while gcc generates only one (a single imulq).


This is very likely the same issue as described in lib/Target/X86/README.txt:

//===---------------------------------------------------------------------===//

This code:

void vec_mpys1(int y[], const int x[], int scaler) {
int i;
for (i = 0; i < 150; i++)
 y[i] += (((long long)scaler * (long long)x[i]) >> 31);
}

Compiles to this loop with GCC 3.x:

.L5:
        movl    %ebx, %eax
        imull   (%edi,%ecx,4)
        shrdl   $31, %edx, %eax
        addl    %eax, (%esi,%ecx,4)
        incl    %ecx
        cmpl    $149, %ecx
        jle     .L5

llvm-gcc compiles it to the much uglier:

LBB1_1: ## bb1
        movl    24(%esp), %eax
        movl    (%eax,%edi,4), %ebx
        movl    %ebx, %ebp
        imull   %esi, %ebp
        movl    %ebx, %eax
        mull    %ecx
        addl    %ebp, %edx
        sarl    $31, %ebx
        imull   %ecx, %ebx
        addl    %edx, %ebx
        shldl   $1, %eax, %ebx
        movl    20(%esp), %eax
        addl    %ebx, (%eax,%edi,4)
        incl    %edi
        cmpl    $150, %edi
        jne     LBB1_1  ## bb1

The issue is that we hoist the cast of "scaler" to long long outside of the
loop, the value comes into the loop as two values, and
RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
constructed BUILD_PAIR which represents the cast value.

This can be handled by making CodeGenPrepare sink the cast.

//===---------------------------------------------------------------------===//

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20130728/433f1114/attachment.html>


More information about the llvm-bugs mailing list