<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - [ARM][Codegen] va_args lowering incorrect"

   href="https://bugs.llvm.org/show_bug.cgi?id=42638">42638</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>[ARM][Codegen] va_args lowering incorrect

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>new-bugs

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>All

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>new bugs

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>diogo.sampaio@arm.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>diogo.sampaio@arm.com, htmldeveloper@gmail.com, llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>For this C code llvm generates incorrect and inefficient ARM code:

===

#include &lt;stdarg.h&gt;

struct __attribute((packed)) S {

  double M0;

};

double foo(int P0, ...) {

  {

    __attribute((aligned(8))) char V1[8];

    __asm volatile("" : : "r"(&V1[0]));

  }

  va_list vl;

  va_start(vl, P0);

  struct S v = va_arg(vl, struct S);

  return v.M0;

}

=======

Compiled with

clang --target=arm-arm-none-eabi -march=armv8-r -c mtest.c -S -O2

-mfloat-abi=softfp -mfpu=fpv5-sp-d16 -o -

will generate this assembly output:

foo:

        .fnstart

        .pad    #12

        sub     sp, sp, #12

        .pad    #20

        sub     sp, sp, #20

        str     r1, [sp, #20]

        add     r1, sp, #20

        add     r0, sp, #8

        str     r2, [sp, #24]

        str     r3, [sp, #28]

        @APP

        @NO_APP

        str     r1, [sp, #8]

        ldr     r0, [sp, #20]

        ldr     r2, [sp, #24]

        str     r0, [sp]

        ldm     sp, {r0, r1} @ ----- Error here: Dependency break    &lt;&lt;&lt;

        str     r2, [sp, #4] @ ----- this should come before the ldm &lt;&lt;&lt;

        add     sp, sp, #20

        add     sp, sp, #12

        bx      lr

=====

The incorrect dependence break occurs during machine-scheduler.

MIR prior to machine-scheduler:

=====

body:             |

  bb.0.entry:

    liveins: $r1, $r2, $r3

    %3:gpr = COPY $r3

    %2:gpr = COPY $r2

    %1:gpr = COPY $r1

    STRi12 %1, %fixed-stack.0, 0, 14, $noreg :: (store 4 into %fixed-stack.0)

    STRi12 %2, %fixed-stack.0, 4, 14, $noreg :: (store 4 into %fixed-stack.0 +

4)

    STRi12 %3, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)

    %4:gpr = ADDri %stack.0.V1, 0, 14, $noreg, $noreg

    INLINEASM &"", 1, 327689, %4

    %5:gpr = ADDri %fixed-stack.0, 0, 14, $noreg, $noreg

    STRi12 %5, %stack.1.vl, 0, 14, $noreg :: (store 4 into %ir.1)

    %6:gpr = LDRi12 %fixed-stack.0, 0, 14, $noreg :: (load 4 from

%ir.argp.cur3)

    %7:gpr = LDRi12 %fixed-stack.0, 4, 14, $noreg :: (load 4 from %ir.argp.cur3

+ 4)

    STRi12 %7, %stack.2, 4, 14, $noreg :: (store 4 into %stack.2 + 4)

    %8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)

    $r0 = COPY %6

    $r1 = COPY %8

    BX_RET 14, $noreg, implicit killed $r0, implicit $r1

=====

And after:

body:             |

  bb.0.entry:

    liveins: $r1, $r2, $r3

    %1:gpr = COPY $r1

    %2:gpr = COPY $r2

    %3:gpr = COPY $r3

    STRi12 %1, %fixed-stack.0, 0, 14, $noreg :: (store 4 into %fixed-stack.0)

    %4:gpr = ADDri %stack.0.V1, 0, 14, $noreg, $noreg

    %5:gpr = ADDri %fixed-stack.0, 0, 14, $noreg, $noreg

    STRi12 %2, %fixed-stack.0, 4, 14, $noreg :: (store 4 into %fixed-stack.0 +

4)

    STRi12 %3, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)

    INLINEASM &"", 1, 327689, %4

    STRi12 %5, %stack.1.vl, 0, 14, $noreg :: (store 4 into %ir.1)

    %6:gpr = LDRi12 %fixed-stack.0, 0, 14, $noreg :: (load 4 from

%ir.argp.cur3)

    %8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)

    %7:gpr = LDRi12 %fixed-stack.0, 4, 14, $noreg :: (load 4 from %ir.argp.cur3

+ 4)

    $r0 = COPY %6

    $r1 = COPY %8

    STRi12 %7, %stack.2, 4, 14, $noreg :: (store 4 into %stack.2 + 4)

    BX_RET 14, $noreg, implicit killed $r0, implicit $r1

=====

However, the error seems to appear before this pass. In the instruction

    %8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)

The memory operand of this instruction is missing the + 4 offset, so the

scheduler does not detect the dependency it breaks by moving the store after it.

Second, the instruction

STRi12 %3, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)

is also incorrect: its memory operand is missing the pointer it stores to

(store 4 into %fixed-stack.0 + 8)

So the invalid MIR is actually produced during the generation of the

MIR:

*** IR Dump After Module Verifier ***

define double @foo(i32 %P0, ...) #0 { 

entry:

  %V1 = alloca [8 x i8], align 8

  %vl = alloca %struct.__va_list, align 4

  %0 = bitcast [8 x i8]* %V1 to i8*

  call void asm sideeffect "", "r"(i8* nonnull %0)

  %1 = bitcast %struct.__va_list* %vl to i8*

  call void @llvm.va_start(i8* nonnull %1)

  %2 = bitcast %struct.__va_list* %vl to double**

  %argp.cur3 = load double*, double** %2, align 4

  %v.sroa.0.0.copyload = load double, double* %argp.cur3, align 4

  ret double %v.sroa.0.0.copyload

}

# *** IR Dump Before Finalize ISel and expand pseudo-instructions ***: 

# Machine code for function foo: IsSSA, TracksLiveness

Frame Objects:

  fi#-1: size=12, align=4, fixed, at location [SP-12]

  fi#0: size=8, align=8, at location [SP] 

  fi#1: size=4, align=4, at location [SP] 

  fi#2: size=8, align=8, at location [SP] 

Function Live Ins: $r1 in %1, $r2 in %2, $r3 in %3

bb.0.entry:

  liveins: $r1, $r2, $r3

  %3:gpr = COPY $r3

  %2:gpr = COPY $r2

  %1:gpr = COPY $r1

  STRi12 %1:gpr, %fixed-stack.0, 0, 14, $noreg :: (store 4 into %fixed-stack.0)

  STRi12 %2:gpr, %fixed-stack.0, 4, 14, $noreg :: (store 4 into %fixed-stack.0

+ 4) 

  STRi12 %3:gpr, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)

  %4:gpr = ADDri %stack.0.V1, 0, 14, $noreg, $noreg

  INLINEASM &"" [sideeffect] [attdialect], $0:[reguse:GPR], %4:gpr

  %5:gpr = ADDri %fixed-stack.0, 0, 14, $noreg, $noreg

  STRi12 killed %5:gpr, %stack.1.vl, 0, 14, $noreg :: (store 4 into %ir.1)

  %6:gpr = LDRi12 %fixed-stack.0, 0, 14, $noreg :: (load 4 from %ir.argp.cur3)

  %7:gpr = LDRi12 %fixed-stack.0, 4, 14, $noreg :: (load 4 from %ir.argp.cur3 +

4) 

  STRi12 killed %7:gpr, %stack.2, 4, 14, $noreg :: (store 4 into %stack.2 + 4) 

  %8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)

  $r0 = COPY %6:gpr

  $r1 = COPY %8:gpr

  BX_RET 14, $noreg, implicit $r0, implicit $r1

# End machine code for function foo. 

====

Lastly, the code is highly inefficient: gcc produces much smaller (though still

not optimal) code. It seems llvm forces the values to be spilled before the

inline assembly.

<a href="https://godbolt.org/z/CjWSWb">https://godbolt.org/z/CjWSWb</a></pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>