[llvm-bugs] [Bug 42638] New: [ARM][Codegen] va_args lowering incorrect

via llvm-bugs llvm-bugs at lists.llvm.org
Tue Jul 16 09:13:54 PDT 2019


https://bugs.llvm.org/show_bug.cgi?id=42638

            Bug ID: 42638
           Summary: [ARM][Codegen] va_args lowering incorrect
           Product: new-bugs
           Version: trunk
          Hardware: All
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: new bugs
          Assignee: unassignedbugs at nondot.org
          Reporter: diogo.sampaio at arm.com
                CC: diogo.sampaio at arm.com, htmldeveloper at gmail.com,
                    llvm-bugs at lists.llvm.org

For this C code llvm generates incorrect and inefficient ARM code:
===
#include <stdarg.h>

struct __attribute((packed)) S {
  double M0;
};

double foo(int P0, ...) {
  {
    __attribute((aligned(8))) char V1[8];
    __asm volatile("" : : "r"(&V1[0]));
  }

  va_list vl;
  va_start(vl, P0);
  struct S v = va_arg(vl, struct S);
  return v.M0;
}

=======
Compiled with
clang --target=arm-arm-none-eabi -march=armv8-r -c mtest.c -S -O2
-mfloat-abi=softfp -mfpu=fpv5-sp-d16 -o -

will generate this assembly output:

foo:
        .fnstart
        .pad    #12
        sub     sp, sp, #12
        .pad    #20
        sub     sp, sp, #20
        str     r1, [sp, #20]
        add     r1, sp, #20
        add     r0, sp, #8
        str     r2, [sp, #24]
        str     r3, [sp, #28]
        @APP
        @NO_APP
        str     r1, [sp, #8]
        ldr     r0, [sp, #20]
        ldr     r2, [sp, #24]
        str     r0, [sp]
        ldm     sp, {r0, r1} @ ----- Error here: Dependency break    <<<
        str     r2, [sp, #4] @ ----- this should come before the ldm <<<
        add     sp, sp, #20
        add     sp, sp, #12
        bx      lr
=====
The incorrect dependence break occurs during the machine-scheduler pass.
MIR prior to machine-scheduler:
=====
body:             |
  bb.0.entry:
    liveins: $r1, $r2, $r3

    %3:gpr = COPY $r3
    %2:gpr = COPY $r2
    %1:gpr = COPY $r1
    STRi12 %1, %fixed-stack.0, 0, 14, $noreg :: (store 4 into %fixed-stack.0)
    STRi12 %2, %fixed-stack.0, 4, 14, $noreg :: (store 4 into %fixed-stack.0 +
4)
    STRi12 %3, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)
    %4:gpr = ADDri %stack.0.V1, 0, 14, $noreg, $noreg
    INLINEASM &"", 1, 327689, %4
    %5:gpr = ADDri %fixed-stack.0, 0, 14, $noreg, $noreg
    STRi12 %5, %stack.1.vl, 0, 14, $noreg :: (store 4 into %ir.1)
    %6:gpr = LDRi12 %fixed-stack.0, 0, 14, $noreg :: (load 4 from
%ir.argp.cur3)
    %7:gpr = LDRi12 %fixed-stack.0, 4, 14, $noreg :: (load 4 from %ir.argp.cur3
+ 4)
    STRi12 %7, %stack.2, 4, 14, $noreg :: (store 4 into %stack.2 + 4)
    %8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)
    $r0 = COPY %6
    $r1 = COPY %8
    BX_RET 14, $noreg, implicit killed $r0, implicit $r1
=====
And after:
body:             |
  bb.0.entry:
    liveins: $r1, $r2, $r3

    %1:gpr = COPY $r1
    %2:gpr = COPY $r2
    %3:gpr = COPY $r3
    STRi12 %1, %fixed-stack.0, 0, 14, $noreg :: (store 4 into %fixed-stack.0)
    %4:gpr = ADDri %stack.0.V1, 0, 14, $noreg, $noreg
    %5:gpr = ADDri %fixed-stack.0, 0, 14, $noreg, $noreg
    STRi12 %2, %fixed-stack.0, 4, 14, $noreg :: (store 4 into %fixed-stack.0 +
4)
    STRi12 %3, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)
    INLINEASM &"", 1, 327689, %4
    STRi12 %5, %stack.1.vl, 0, 14, $noreg :: (store 4 into %ir.1)
    %6:gpr = LDRi12 %fixed-stack.0, 0, 14, $noreg :: (load 4 from
%ir.argp.cur3)
    %8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)
    %7:gpr = LDRi12 %fixed-stack.0, 4, 14, $noreg :: (load 4 from %ir.argp.cur3
+ 4)
    $r0 = COPY %6
    $r1 = COPY %8
    STRi12 %7, %stack.2, 4, 14, $noreg :: (store 4 into %stack.2 + 4)
    BX_RET 14, $noreg, implicit killed $r0, implicit $r1
=====
However, the error seems to appear before this pass. In the instruction

    %8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)

This instruction is missing the "+ 4" offset, so the scheduler does not detect
the dependency and breaks it by moving the store to after the load.

Second, the instruction

STRi12 %3, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)

is also incorrect: it is missing the pointer it is storing to. It should read

(store 4 into %fixed-stack.0 + 8)

So the invalid MIR is actually generated during the initial generation of the
MIR:

*** IR Dump After Module Verifier ***
define double @foo(i32 %P0, ...) #0 { 
entry:
  %V1 = alloca [8 x i8], align 8
  %vl = alloca %struct.__va_list, align 4
  %0 = bitcast [8 x i8]* %V1 to i8*
  call void asm sideeffect "", "r"(i8* nonnull %0)
  %1 = bitcast %struct.__va_list* %vl to i8*
  call void @llvm.va_start(i8* nonnull %1)
  %2 = bitcast %struct.__va_list* %vl to double**
  %argp.cur3 = load double*, double** %2, align 4
  %v.sroa.0.0.copyload = load double, double* %argp.cur3, align 4
  ret double %v.sroa.0.0.copyload
}
# *** IR Dump Before Finalize ISel and expand pseudo-instructions ***: 
# Machine code for function foo: IsSSA, TracksLiveness
Frame Objects:
  fi#-1: size=12, align=4, fixed, at location [SP-12]
  fi#0: size=8, align=8, at location [SP] 
  fi#1: size=4, align=4, at location [SP] 
  fi#2: size=8, align=8, at location [SP] 
Function Live Ins: $r1 in %1, $r2 in %2, $r3 in %3

bb.0.entry:
  liveins: $r1, $r2, $r3
  %3:gpr = COPY $r3
  %2:gpr = COPY $r2
  %1:gpr = COPY $r1
  STRi12 %1:gpr, %fixed-stack.0, 0, 14, $noreg :: (store 4 into %fixed-stack.0)
  STRi12 %2:gpr, %fixed-stack.0, 4, 14, $noreg :: (store 4 into %fixed-stack.0
+ 4) 
  STRi12 %3:gpr, %fixed-stack.0, 8, 14, $noreg :: (store 4 + 8)
  %4:gpr = ADDri %stack.0.V1, 0, 14, $noreg, $noreg
  INLINEASM &"" [sideeffect] [attdialect], $0:[reguse:GPR], %4:gpr
  %5:gpr = ADDri %fixed-stack.0, 0, 14, $noreg, $noreg
  STRi12 killed %5:gpr, %stack.1.vl, 0, 14, $noreg :: (store 4 into %ir.1)
  %6:gpr = LDRi12 %fixed-stack.0, 0, 14, $noreg :: (load 4 from %ir.argp.cur3)
  %7:gpr = LDRi12 %fixed-stack.0, 4, 14, $noreg :: (load 4 from %ir.argp.cur3 +
4) 
  STRi12 killed %7:gpr, %stack.2, 4, 14, $noreg :: (store 4 into %stack.2 + 4) 
  %8:gpr = LDRi12 %stack.2, 4, 14, $noreg :: (load 4 from %stack.2)
  $r0 = COPY %6:gpr
  $r1 = COPY %8:gpr
  BX_RET 14, $noreg, implicit $r0, implicit $r1

# End machine code for function foo. 




====
Lastly, the generated code is highly inefficient: gcc produces much smaller
(though still not optimal) code. It seems LLVM forces the values to be spilled
before the inline assembly.

https://godbolt.org/z/CjWSWb

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20190716/e3044911/attachment-0001.html>


More information about the llvm-bugs mailing list