[LLVMdev] x86-64 backend generates aligned ADDPS with unaligned address
Frank Winter
fwinter at jlab.org
Wed Jul 29 13:02:44 PDT 2015
When I compile the attached IR with LLVM 3.6 using
llc -march=x86-64 -o f.S f.ll
it generates an ADDPS whose memory operand must be 16-byte aligned, but the
address it uses is not. See the attached f.S; here is an extract:
addq $12, %r9 # 12 bytes (3 floats) is not a multiple of 16, so a 16-byte xmm access through %r9 is unaligned
xorl %esi, %esi
.align 16, 0x90
.LBB0_1: # %loop2
# =>This Inner Loop Header: Depth=1
movq offset_array3(,%rsi,8), %rdi
movq offset_array2(,%rsi,8), %r10
movss -28(%rax), %xmm0
movss -8(%rax), %xmm1
movss -4(%rax), %xmm2
unpcklps %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
movss (%rax), %xmm0
unpcklps %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
unpcklps %xmm2, %xmm1 # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
addps (%r9), %xmm1 # here the unaligned address is used as a memory operand, which causes a segfault
Frank
-------------- next part --------------
;; ModuleID = 'module'
target triple = "x86_64-unknown-linux-gnu"
; Per-iteration lookup tables read by @func, indexed by its loop counter.
; Each entry is used as a float-element index in a getelementptr:
; @offset_array2 indexes %arg1 and @offset_array3 indexes %arg2.
; Every entry, scaled by 4 bytes per float, is a multiple of 16 bytes.
@offset_array2 = internal constant [8 x i64] [i64 60, i64 4, i64 12, i64 20, i64 28, i64 36, i64 44, i64 52]
@offset_array3 = internal constant [8 x i64] [i64 12, i64 20, i64 28, i64 36, i64 44, i64 52, i64 60, i64 4]
declare float @sinf(float)
declare float @acosf(float)
declare float @asinf(float)
declare float @atanf(float)
declare float @ceilf(float)
declare float @floorf(float)
declare float @cosf(float)
declare float @coshf(float)
declare float @expf(float)
declare float @logf(float)
declare float @log10f(float)
declare float @sinhf(float)
declare float @tanf(float)
declare float @tanhf(float)
declare float @fabsf(float)
declare float @sqrtf(float)
declare float @powf(float, float)
declare float @atan2f(float, float)
declare double @sin(double)
declare double @acos(double)
declare double @asin(double)
declare double @atan(double)
declare double @ceil(double)
declare double @floor(double)
declare double @cos(double)
declare double @cosh(double)
declare double @exp(double)
declare double @log(double)
declare double @log10(double)
declare double @sinh(double)
declare double @tan(double)
declare double @tanh(double)
declare double @fabs(double)
declare double @sqrt(double)
declare double @pow(double, double)
declare double @atan2(double, double)
; @func: runs a fixed 8-iteration loop (i = 0..7). Each iteration builds one
; <4 x float> from four scalar loads of %arg4, adds three <4 x float> values
; loaded from %arg3, %arg2 and %arg1, and stores the sum to %arg0.
;
; Float-element indices touched in iteration i:
;   %arg0 : store of 4 floats at 8*i + 4
;   %arg1 : load of 4 floats at @offset_array2[i]
;   %arg2 : load of 4 floats at @offset_array3[i]
;   %arg3 : load of 4 floats at 8*i + 3
;   %arg4 : scalar loads at 8*i + 5, 8*i + 6, 8*i + 7 and 8*i + 0
;
; %lo and %hi are not referenced anywhere in the body.
define void @func(i64 %lo, i64 %hi, float* %arg0, float* %arg1, float* %arg2, float* %arg3, float* %arg4) {
pre_loop3:
br label %loop2
loop2: ; preds = %loop2, %pre_loop3
; %0 is the loop counter i: 0 on entry, %42 (= i + 1) on the back edge.
%0 = phi i64 [ 0, %pre_loop3 ], [ %42, %loop2 ]
; %2 = @offset_array3[i], %4 = @offset_array2[i]
%1 = getelementptr [8 x i64]* @offset_array3, i64 0, i64 %0
%2 = load i64* %1
%3 = getelementptr [8 x i64]* @offset_array2, i64 0, i64 %0
%4 = load i64* %3
; %7 = the four floats starting at %arg1[%4]
%5 = getelementptr float* %arg1, i64 %4
%6 = bitcast float* %5 to <4 x float>*
%7 = load <4 x float>* %6
; %10 = the four floats starting at %arg2[%2]
%8 = getelementptr float* %arg2, i64 %2
%9 = bitcast float* %8 to <4 x float>*
%10 = load <4 x float>* %9
; %15 = the four floats starting at %arg3[8*i + 3], i.e. byte offset 32*i + 12
; from %arg3, which is never a multiple of 16.
%11 = mul i64 %0, 8
%12 = add i64 %11, 3 ; <--------- this creates the unaligned address!!
%13 = getelementptr float* %arg3, i64 %12
%14 = bitcast float* %13 to <4 x float>*
; NOTE(review): this load carries no explicit `align`. The LLVM LangRef says an
; omitted alignment means the ABI alignment of the loaded type for the target.
; If that is 16 bytes for <4 x float> here (TODO confirm against the target
; datalayout), the IR itself promises a 16-byte-aligned address, which would
; explain the aligned memory operand in the generated code; writing
; `, align 4` on this load would state the alignment the address really has.
%15 = load <4 x float>* %14
; Scalar loads from %arg4: %19 = [8*i + 5], %23 = [8*i + 6], %27 = [8*i + 7],
; %30 = [8*i + 0].
%16 = mul i64 %0, 8
%17 = add i64 %16, 5
%18 = getelementptr float* %arg4, i64 %17
%19 = load float* %18
%20 = mul i64 %0, 8
%21 = add i64 %20, 6
%22 = getelementptr float* %arg4, i64 %21
%23 = load float* %22
%24 = mul i64 %0, 8
%25 = add i64 %24, 7
%26 = getelementptr float* %arg4, i64 %25
%27 = load float* %26
%28 = mul i64 %0, 8
%29 = getelementptr float* %arg4, i64 %28
%30 = load float* %29
; %34 = < %19, %23, %27, %30 >, assembled lane by lane.
%31 = insertelement <4 x float> undef, float %19, i32 0
%32 = insertelement <4 x float> %31, float %23, i32 1
%33 = insertelement <4 x float> %32, float %27, i32 2
%34 = insertelement <4 x float> %33, float %30, i32 3
; %37 = &%arg0[8*i + 4], the destination of this iteration's result.
%35 = mul i64 %0, 8
%36 = add i64 %35, 4
%37 = getelementptr float* %arg0, i64 %36
; %40 = ((%34 + %15) + %10) + %7
%38 = fadd <4 x float> %34, %15
%39 = fadd <4 x float> %38, %10
%40 = fadd <4 x float> %39, %7
%41 = bitcast float* %37 to <4 x float>*
store <4 x float> %40, <4 x float>* %41
; Advance i and leave the loop once i + 1 >= 8 (unsigned compare).
%42 = add nsw i64 %0, 1
%43 = icmp uge i64 %42, 8
br i1 %43, label %exit_loop1, label %loop2
exit_loop1: ; preds = %loop2
br label %pre_loop
pre_loop: ; preds = %exit_loop1
br label %entrypoint
entrypoint: ; preds = %pre_loop
ret void
}
-------------- next part --------------
#-----------------------------------------------------------------------
# func -- code generated by llc (LLVM 3.6) from @func in f.ll.
# Syntax: AT&T / GAS, x86-64 ELF. The instruction sequence is left exactly
# as the compiler emitted it; only layout and comments differ.
#
# IR signature:
#   void func(i64 lo, i64 hi, float *arg0, float *arg1, float *arg2,
#             float *arg3, float *arg4)
# Register use matches the System V AMD64 argument order:
#   %rdx = arg0, %rcx = arg1, %r8 = arg2, %r9 = arg3, 8(%rsp) = arg4
#   %rdi (lo) and %rsi (hi) are never read; both are reused as scratch.
#
# Loop state (i = iteration number, 0..7):
#   %rsi = i
#   %rdx = arg0 + 16 + 32*i   (store address, &arg0[8*i + 4])
#   %rax = arg4 + 28 + 32*i   (&arg4[8*i + 7])
#   %r9  = arg3 + 12 + 32*i   (&arg3[8*i + 3])
#
# Alignment: addps with a memory operand and movaps both fault when the
# address is not 16-byte aligned. (%r9) is arg3 + 12 + 32*i, so it is
# 16-byte aligned only when arg3 is congruent to 4 mod 16 -- never when
# arg3 itself is 16-byte aligned. This is the access the report is about.
#
# Writes: %rax, %rdx, %rsi, %rdi, %r9, %r10, %xmm0-%xmm2, flags.
# Stack:  reads the 7th argument at 8(%rsp); no frame is set up.
#-----------------------------------------------------------------------
        .text
        .file   "f.ll"
        .globl  func
        .align  16, 0x90
        .type   func,@function
func:                                   # @func
        .cfi_startproc
# BB#0:                                 # %pre_loop3
        movq    8(%rsp), %rax           # rax = arg4 (first stack argument)
        addq    $16, %rdx               # rdx = &arg0[4]
        addq    $28, %rax               # rax = &arg4[7]
        addq    $12, %r9                # r9  = &arg3[3]
        xorl    %esi, %esi              # i = 0
        .align  16, 0x90
.LBB0_1:                                # %loop2
                                        # =>This Inner Loop Header: Depth=1
        movq    offset_array3(,%rsi,8), %rdi    # rdi = offset_array3[i]
        movq    offset_array2(,%rsi,8), %r10    # r10 = offset_array2[i]

        # Gather arg4[8i+5], arg4[8i+6], arg4[8i+7], arg4[8i+0] into xmm1.
        movss   -28(%rax), %xmm0        # xmm0 = arg4[8i + 0]
        movss   -8(%rax), %xmm1         # xmm1 = arg4[8i + 5]
        movss   -4(%rax), %xmm2         # xmm2 = arg4[8i + 6]
        unpcklps %xmm0, %xmm2           # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
        movss   (%rax), %xmm0           # xmm0 = arg4[8i + 7]
        unpcklps %xmm0, %xmm1           # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
        unpcklps %xmm2, %xmm1           # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]

        addps   (%r9), %xmm1            # += arg3[8i+3 .. 8i+6]; address is arg3 + 12 + 32*i
        addps   (%r8,%rdi,4), %xmm1     # += four floats at arg2[offset_array3[i]]
        addps   (%rcx,%r10,4), %xmm1    # += four floats at arg1[offset_array2[i]]
        movaps  %xmm1, (%rdx)           # arg0[8i+4 .. 8i+7] = sum

        incq    %rsi                    # ++i
        addq    $32, %rdx               # each pointer advances 8 floats
        addq    $32, %rax
        addq    $32, %r9
        cmpq    $8, %rsi
        jb      .LBB0_1                 # loop while i < 8 (unsigned)
# BB#2:                                 # %entrypoint
        retq
.Ltmp0:
        .size   func, .Ltmp0-func
        .cfi_endproc
# Read-only lookup tables for func's loop: eight 64-bit entries each, read
# with offset_arrayN(,%rsi,8) where %rsi is the iteration number.
# func uses each entry as an index scaled by 4 (one float):
#   offset_array2[i] indexes the %rcx base, offset_array3[i] the %r8 base.
# Every entry times 4 is a multiple of 16 bytes.
        .type   offset_array2,@object   # @offset_array2
        .section        .rodata,"a",@progbits
        .align  16
offset_array2:
        .quad   60                      # 0x3c
        .quad   4                       # 0x4
        .quad   12                      # 0xc
        .quad   20                      # 0x14
        .quad   28                      # 0x1c
        .quad   36                      # 0x24
        .quad   44                      # 0x2c
        .quad   52                      # 0x34
        .size   offset_array2, 64

        .type   offset_array3,@object   # @offset_array3
        .align  16
offset_array3:
        .quad   12                      # 0xc
        .quad   20                      # 0x14
        .quad   28                      # 0x1c
        .quad   36                      # 0x24
        .quad   44                      # 0x2c
        .quad   52                      # 0x34
        .quad   60                      # 0x3c
        .quad   4                       # 0x4
        .size   offset_array3, 64

        # Empty marker section; by GNU toolchain convention it records that
        # this object does not need an executable stack.
        .section        ".note.GNU-stack","",@progbits
More information about the llvm-dev
mailing list