            Bug ID: 16725
           Summary: Inefficient code generated for i64 * i64 -> i128
                    multiplication in loop
The code generator currently doesn't expand a 128-bit multiplication of two
sign-extended 64-bit integers into a single SMUL_LOHI node if the sign
extension of one operand is performed outside the loop and the widened value
is carried across the loop boundary.

Example code:

#define SHIFT 63
#define MASK ((1LU << SHIFT) - 1)
void mul1(int64_t *arr, long arrsize, int64_t factor) {
  __int128 carry = 0;
  for (long i = 0; i < arrsize; ++i) {
    __int128 e = arr[i];
    carry += e * factor;
    arr[i] = carry & MASK;
    carry >>= SHIFT;
  }
}

Clang generates this code for the inner loop:

movq    (%rdi), %rcx
movq    %rcx, %rbx
imulq    %r9, %rbx
movq    %rcx, %rax
mulq    %r8
addq    %rbx, %rdx
sarq    $63, %rcx
imulq    %r8, %rcx
addq    %rdx, %rcx
addq    %r10, %rax
adcq    %r14, %rcx
movq    %rcx, %r10
shldq    $1, %rax, %r10
andq    %r11, %rax
movq    %rax, (%rdi)
sarq    $63, %rcx
addq    $8, %rdi
decq    %rsi
movq    %rcx, %r14
jne    .LBB0_2

While gcc generates this code:

movq    %r8, %rax
imulq    (%rcx)
addq    %rsi, %rax
movq    %rax, %rsi
adcq    %rdi, %rdx
addq    $8, %rcx
andq    %r9, %rsi
movq    %rsi, -8(%rcx)
shrdq    $63, %rdx, %rax
sarq    $63, %rdx
cmpq    %r10, %rcx
movq    %rax, %rsi
movq    %rdx, %rdi
jne    .L4

Clang generates three multiplication instructions while gcc only generates one.

This is very likely the same issue as described in lib/Target/X86/README.txt:


This code:

void vec_mpys1(int y[], const int x[], int scaler) {
int i;
for (i = 0; i < 150; i++)
 y[i] += (((long long)scaler * (long long)x[i]) >> 31);
}

Compiles to this loop with GCC 3.x:

        movl    %ebx, %eax
        imull   (%edi,%ecx,4)
        shrdl   $31, %edx, %eax
        addl    %eax, (%esi,%ecx,4)
        incl    %ecx
        cmpl    $149, %ecx
        jle     .L5

llvm-gcc compiles it to the much uglier:

LBB1_1: ## bb1
        movl    24(%esp), %eax
        movl    (%eax,%edi,4), %ebx
        movl    %ebx, %ebp
        imull   %esi, %ebp
        movl    %ebx, %eax
        mull    %ecx
        addl    %ebp, %edx
        sarl    $31, %ebx
        imull   %ecx, %ebx
        addl    %edx, %ebx
        shldl   $1, %eax, %ebx
        movl    20(%esp), %eax
        addl    %ebx, (%eax,%edi,4)
        incl    %edi
        cmpl    $150, %edi
        jne     LBB1_1  ## bb1

The issue is that we hoist the cast of "scaler" to long long outside of the
loop, the value comes into the loop as two values, and
RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
constructed BUILD_PAIR which represents the cast value.

This can be handled by making CodeGenPrepare sink the cast.


