[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?

Fri Nov 11 18:05:24 PST 2016

Hi Mehdi,

>> No, my targets are x86-64, x86, arm, aarch64, ..., so I'll avoid using i192* and datalayout.
>
>There is nothing specific with i192. You will likely run into issues by not specifying the right datalayout.
>
>The optimizations will always run with a datalayout: if you don’t specify one there will be a default one, which can cause problems on some targets (like you saw on arm).
>For instance, the optimizer will assume a pointer size and optimize based on this.

I wrote the code without i192*, as shown below, and got the output I wanted.
I'll rewrite my other code in the same way.

; load192: read three consecutive i64 words starting at %r2 and combine them
; into a single i192. Word 0 lands in bits 0..63, word 1 in bits 64..127 and
; word 2 in bits 128..191 of the result.
define i192 @load192(i64* %r2) {
; word 0 -> bits 0..63
%w0 = load i64, i64* %r2
%lo64 = zext i64 %w0 to i128
; word 1 -> bits 64..127
%w1.ptr = getelementptr i64, i64* %r2, i32 1
%w1 = load i64, i64* %w1.ptr
%w1.ext = zext i64 %w1 to i128
%w1.shifted = shl i128 %w1.ext, 64
%lo128 = or i128 %lo64, %w1.shifted
; widen the 128-bit partial value before merging the last word
%lo128.ext = zext i128 %lo128 to i192
; word 2 -> bits 128..191
%w2.ptr = getelementptr i64, i64* %r2, i32 2
%w2 = load i64, i64* %w2.ptr
%w2.ext = zext i64 %w2 to i192
%w2.shifted = shl i192 %w2.ext, 128
%all = or i192 %lo128.ext, %w2.shifted
ret i192 %all
}

; C-like sketch of what @add computes:
;
;   struct i192_t {
;     uint64_t v[3];
;   };
;   void add(i192_t *y, const i192_t* x)
;   {
;     *y = x[0] + x[1]; // pseudo code
;   }
;
; %r1 is y (three i64 words written), %r2 is x (six i64 words read).
; The sum is an `add i192`, so any carry out of bit 191 is discarded.
define void @add(i64* noalias %r1, i64* noalias %r2) {
; x[0] occupies words 0..2 of %r2
%lhs = call i192 @load192(i64* %r2)
; x[1] occupies words 3..5 of %r2
%x1.ptr = getelementptr i64, i64* %r2, i32 3
%rhs = call i192 @load192(i64* %x1.ptr)
%sum = add i192 %lhs, %rhs
; y word 0 = bits 0..63 of the sum
%y0.ptr = getelementptr i64, i64* %r1, i32 0
%sum.w0 = trunc i192 %sum to i64
store i64 %sum.w0, i64* %y0.ptr
; y word 1 = bits 64..127 of the sum
%sum.shr64 = lshr i192 %sum, 64
%y1.ptr = getelementptr i64, i64* %r1, i32 1
%sum.w1 = trunc i192 %sum.shr64 to i64
store i64 %sum.w1, i64* %y1.ptr
; y word 2 = bits 128..191 of the sum (second 64-bit shift of the same value)
%sum.shr128 = lshr i192 %sum.shr64, 64
%y2.ptr = getelementptr i64, i64* %r1, i32 2
%sum.w2 = trunc i192 %sum.shr128 to i64
store i64 %sum.w2, i64* %y2.ptr
ret void
}

% opt-3.8 -O3 a.ll -o - | llc-3.8 -O3 -o - -march=x86-64
# x86-64 code generated for @add (AT&T syntax: op src, dst).
# %rdi holds the first argument (%r1, destination y) and %rsi the second
# (%r2, source x), matching the System V AMD64 argument registers.
# Byte offsets from %rsi: x[0] words at 0/8/16, x[1] words at 24/32/40.
add:
        movq    16(%rsi), %rax          # rax = x[0] word 2 (high word)
        movq    24(%rsi), %rcx          # rcx = x[1] word 0 (low word)
        movq    32(%rsi), %rdx          # rdx = x[1] word 1
        addq    (%rsi), %rcx            # low words: rcx += x[0] word 0, sets CF
        adcq    8(%rsi), %rdx           # middle words plus carry from the low add
        adcq    40(%rsi), %rax          # high words plus carry; final carry is dropped
        movq    %rcx, (%rdi)            # y word 0
        movq    %rdx, 8(%rdi)           # y word 1
        movq    %rax, 16(%rdi)          # y word 2
        retq

% opt-3.8 -O3 a.ll -o - | llc-3.8 -O3 -o - -march=aarch64
// AArch64 code generated for @add.
// x0 holds the first argument (%r1, destination y) and x1 the second
// (%r2, source x), matching the AAPCS64 argument registers.
// Byte offsets from x1: x[0] words at 0/8/16, x[1] words at 24/32/40.
add:
        ldp             x8, x9, [x1]            // x8 = x[0] word 0, x9 = x[0] word 1
        ldp     x10, x11, [x1, #24]             // x10 = x[1] word 0, x11 = x[1] word 1
        ldr     x12, [x1, #16]                  // x12 = x[0] word 2 (high word)
        ldr     x13, [x1, #40]                  // x13 = x[1] word 2 (high word)
        adds            x8, x10, x8             // low words, sets the carry flag
        adcs    x9, x11, x9                     // middle words plus carry
        stp             x8, x9, [x0]            // store y words 0 and 1; stp leaves the flags alone
        adcs    x8, x13, x12                    // high words plus carry (x8 is reused after the store)
        str     x8, [x0, #16]                   // y word 2
        ret

Yours,
 Shigeo