<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - [ARM][Codegen] va_args lowering incorrect"
href="https://bugs.llvm.org/show_bug.cgi?id=42638">42638</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>[ARM][Codegen] va_args lowering incorrect
</td>
</tr>
<tr>
<th>Product</th>
<td>new-bugs
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>All
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>normal
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>new bugs
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>diogo.sampaio@arm.com
</td>
</tr>
<tr>
<th>CC</th>
<td>diogo.sampaio@arm.com, htmldeveloper@gmail.com, llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>For the following C code, LLVM generates incorrect and inefficient ARM code:
===
#include <stdarg.h>
struct __attribute((packed)) S {
  double M0;
};
double foo(int P0, ...) {
  {
    __attribute((aligned(8))) char V1[8];
    __asm volatile("" : : "r"(&V1[0]));
  }
  va_list vl;
  va_start(vl, P0);
  struct S v = va_arg(vl, struct S);
  return v.M0;
}
=======
Compiling with
clang --target=arm-arm-none-eabi -march=armv8-r -c mtest.c -S -O2 -mfloat-abi=softfp -mfpu=fpv5-sp-d16 -o -
produces this assembly output:
foo:
.fnstart
.pad #12
sub sp, sp, #12
.pad #20
sub sp, sp, #20
str r1, [sp, #20]
add r1, sp, #20
add r0, sp, #8
str r2, [sp, #24]
str r3, [sp, #28]
@APP
@NO_APP
str r1, [sp, #8]
ldr r0, [sp, #20]
ldr r2, [sp, #24]
str r0, [sp]
ldm sp, {r0, r1} @ ----- Error here: Dependency break <<<
str r2, [sp, #4] @ ----- this should come before the ldm <<<
add sp, sp, #20
add sp, sp, #12
bx lr
=====
The ldm reads r1 from [sp, #4] before the str r2, [sp, #4] that should have filled
that slot, so r1 ends up with a stale value. This incorrect store-to-load
dependency break is introduced by the machine-scheduler pass.
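(For reference: MIR dumps like the ones below can presumably be reproduced by
passing something like
  -mllvm -print-before=machine-scheduler -mllvm -print-after=machine-scheduler
on the clang command line above.)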
MIR prior to machine-scheduler:
=====
body: |
bb.0.entry:
liveins: $r1, $r2, $r3
%3:gpr = COPY $r3
%2:gpr = COPY $r2
%1:gpr = COPY $r1
STRi12 %1, %fixed-stack.0, 0, 14, $noreg :: (store 4 into %fixed-stack.0)
STRi12 %2, %fixed-stack.0, 4, 14, $noreg :: (store 4 into %fixed-stack.0 + 4)
STRi12 %3, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)
%4:gpr = ADDri %stack.0.V1, 0, 14, $noreg, $noreg
INLINEASM &"", 1, 327689, %4
%5:gpr = ADDri %fixed-stack.0, 0, 14, $noreg, $noreg
STRi12 %5, %stack.1.vl, 0, 14, $noreg :: (store 4 into %ir.1)
%6:gpr = LDRi12 %fixed-stack.0, 0, 14, $noreg :: (load 4 from %ir.argp.cur3)
%7:gpr = LDRi12 %fixed-stack.0, 4, 14, $noreg :: (load 4 from %ir.argp.cur3 + 4)
STRi12 %7, %stack.2, 4, 14, $noreg :: (store 4 into %stack.2 + 4)
%8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)
$r0 = COPY %6
$r1 = COPY %8
BX_RET 14, $noreg, implicit killed $r0, implicit $r1
=====
And after:
body: |
bb.0.entry:
liveins: $r1, $r2, $r3
%1:gpr = COPY $r1
%2:gpr = COPY $r2
%3:gpr = COPY $r3
STRi12 %1, %fixed-stack.0, 0, 14, $noreg :: (store 4 into %fixed-stack.0)
%4:gpr = ADDri %stack.0.V1, 0, 14, $noreg, $noreg
%5:gpr = ADDri %fixed-stack.0, 0, 14, $noreg, $noreg
STRi12 %2, %fixed-stack.0, 4, 14, $noreg :: (store 4 into %fixed-stack.0 + 4)
STRi12 %3, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)
INLINEASM &"", 1, 327689, %4
STRi12 %5, %stack.1.vl, 0, 14, $noreg :: (store 4 into %ir.1)
%6:gpr = LDRi12 %fixed-stack.0, 0, 14, $noreg :: (load 4 from %ir.argp.cur3)
%8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)
%7:gpr = LDRi12 %fixed-stack.0, 4, 14, $noreg :: (load 4 from %ir.argp.cur3 + 4)
$r0 = COPY %6
$r1 = COPY %8
STRi12 %7, %stack.2, 4, 14, $noreg :: (store 4 into %stack.2 + 4)
BX_RET 14, $noreg, implicit killed $r0, implicit $r1
=====
However, the error seems to be introduced before this pass. The memory operand of
the instruction
%8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)
is missing the "+ 4" offset, so the scheduler does not see the dependency it
breaks when it moves the corresponding store after this load.
Second, the instruction
STRi12 %3, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)
is also incorrect: its memory operand is missing the pointer being stored to,
which should read (store 4 into %fixed-stack.0 + 8).
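That is, with complete memory operands these two instructions would presumably read:
%8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2 + 4)
STRi12 %3, %fixed-stack.0, 8, 14, $noreg :: (store 4 into %fixed-stack.0 + 8)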
So the invalid memory operands are actually produced when the MIR is first
created. Below is the input LLVM IR, followed by the MIR as emitted by
instruction selection (before Finalize ISel):
*** IR Dump After Module Verifier ***
define double @foo(i32 %P0, ...) #0 {
entry:
%V1 = alloca [8 x i8], align 8
%vl = alloca %struct.__va_list, align 4
%0 = bitcast [8 x i8]* %V1 to i8*
call void asm sideeffect "", "r"(i8* nonnull %0)
%1 = bitcast %struct.__va_list* %vl to i8*
call void @llvm.va_start(i8* nonnull %1)
%2 = bitcast %struct.__va_list* %vl to double**
%argp.cur3 = load double*, double** %2, align 4
%v.sroa.0.0.copyload = load double, double* %argp.cur3, align 4
ret double %v.sroa.0.0.copyload
}
# *** IR Dump Before Finalize ISel and expand pseudo-instructions ***:
# Machine code for function foo: IsSSA, TracksLiveness
Frame Objects:
fi#-1: size=12, align=4, fixed, at location [SP-12]
fi#0: size=8, align=8, at location [SP]
fi#1: size=4, align=4, at location [SP]
fi#2: size=8, align=8, at location [SP]
Function Live Ins: $r1 in %1, $r2 in %2, $r3 in %3
bb.0.entry:
liveins: $r1, $r2, $r3
%3:gpr = COPY $r3
%2:gpr = COPY $r2
%1:gpr = COPY $r1
STRi12 %1:gpr, %fixed-stack.0, 0, 14, $noreg :: (store 4 into %fixed-stack.0)
STRi12 %2:gpr, %fixed-stack.0, 4, 14, $noreg :: (store 4 into %fixed-stack.0 + 4)
STRi12 %3:gpr, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)
%4:gpr = ADDri %stack.0.V1, 0, 14, $noreg, $noreg
INLINEASM &"" [sideeffect] [attdialect], $0:[reguse:GPR], %4:gpr
%5:gpr = ADDri %fixed-stack.0, 0, 14, $noreg, $noreg
STRi12 killed %5:gpr, %stack.1.vl, 0, 14, $noreg :: (store 4 into %ir.1)
%6:gpr = LDRi12 %fixed-stack.0, 0, 14, $noreg :: (load 4 from %ir.argp.cur3)
%7:gpr = LDRi12 %fixed-stack.0, 4, 14, $noreg :: (load 4 from %ir.argp.cur3 + 4)
STRi12 killed %7:gpr, %stack.2, 4, 14, $noreg :: (store 4 into %stack.2 + 4)
%8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)
$r0 = COPY %6:gpr
$r1 = COPY %8:gpr
BX_RET 14, $noreg, implicit $r0, implicit $r1
# End machine code for function foo.
====
Finally, the generated code is highly inefficient: gcc produces much smaller
(though still not optimal) code. LLVM seems to force the values to be spilled
before the inline assembly.
<a href="https://godbolt.org/z/CjWSWb">https://godbolt.org/z/CjWSWb</a></pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>