[llvm-bugs] [Bug 26723] New: Improve code size with a more space-efficient vtable ABI

via llvm-bugs llvm-bugs at lists.llvm.org
Wed Feb 24 00:00:19 PST 2016


https://llvm.org/bugs/show_bug.cgi?id=26723

            Bug ID: 26723
           Summary: Improve code size with a more space-efficient vtable
                    ABI
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: Interprocedural Optimizations
          Assignee: unassignedbugs at nondot.org
          Reporter: peter at pcc.me.uk
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

A more space-efficient representation for virtual tables would store 32-bit
address-point-relative offsets to virtual functions instead of virtual
function pointers. By using this representation, we avoid the need to emit
dynamic relocations for the virtual function pointers. On ELF, taking into
account the size of the dynamic relocation entry, this would save 32-4=28
bytes per virtual table slot on 64-bit architectures, or 16-4=12 bytes per
slot on 32-bit architectures.

This is an ABI break, but we can practically deploy it with whole program
visibility, which can be assured at LTO time using something along the lines
of the bitset metadata currently produced by -fwhole-program-vtables.

By avoiding these dynamic relocations, this would also save the program some
startup time, and when RTTI is disabled there would be no need to store any
dynamically relocated pointers in the vtable region, so the vtable pages could
be shared among processes.

See also PR24782. I suspect that vtables are writable on Mac because of
these dynamic relocations.

Code generation example:

#include <stdint.h>

struct A;

// Hand-modelled "relative" vtable for A, laid out the way the proposed
// space-saving ABI would emit it: each virtual slot holds a 32-bit offset
// from the vtable's own address point to the target function, instead of a
// full (dynamically relocated) function pointer. Slot `i` is kept as a
// conventional absolute pointer for side-by-side codegen comparison.
struct A_vtable {
  int32_t f_offset;  // slot 0: address-point-relative offset to f
  int32_t g_offset;  // slot 1: address-point-relative offset to g
  int32_t h_offset;  // slot 2: address-point-relative offset to h
  void (*i)(A *);    // conventional slot: absolute function pointer
};

// Minimal polymorphic object model: the object carries only its vptr,
// which points at the relative-layout vtable.
struct A {
  A_vtable *vtable;  // vptr
};

// Virtual call through slot 0 of the relative vtable: the callee's entry
// point is recovered by adding the 32-bit offset stored in the slot to the
// address of the vtable itself, then calling through the resulting pointer.
void fcall(A *a) {
  A_vtable *const tbl = a->vtable;
  char *const base = reinterpret_cast<char *>(tbl);
  auto const target = reinterpret_cast<void (*)(A *)>(base + tbl->f_offset);
  target(a);
}

// Virtual call through slot 1 (g) of the relative vtable; identical scheme
// to fcall, only the slot offset differs.
void gcall(A *a) {
  A_vtable *const tbl = a->vtable;
  char *const base = reinterpret_cast<char *>(tbl);
  auto const target = reinterpret_cast<void (*)(A *)>(base + tbl->g_offset);
  target(a);
}

// Virtual call through slot 2 (h) of the relative vtable; identical scheme
// to fcall, only the slot offset differs.
void hcall(A *a) {
  A_vtable *const tbl = a->vtable;
  char *const base = reinterpret_cast<char *>(tbl);
  auto const target = reinterpret_cast<void (*)(A *)>(base + tbl->h_offset);
  target(a);
}

// Baseline virtual call through the conventional absolute-pointer slot;
// no offset arithmetic is required, serving as the codegen comparison point.
void icall(A *a) {
  A_vtable *const tbl = a->vtable;
  (*tbl->i)(a);
}

Generated code (x86-64):

0000000000000000 <_Z5fcallP1A>:
   0:    48 8b 07                 mov    (%rdi),%rax
   3:    48 63 08                 movslq (%rax),%rcx
   6:    48 01 c1                 add    %rax,%rcx
   9:    ff e1                    jmpq   *%rcx
   b:    0f 1f 44 00 00           nopl   0x0(%rax,%rax,1)

0000000000000010 <_Z5gcallP1A>:
  10:    48 8b 07                 mov    (%rdi),%rax
  13:    48 63 48 04              movslq 0x4(%rax),%rcx
  17:    48 01 c1                 add    %rax,%rcx
  1a:    ff e1                    jmpq   *%rcx
  1c:    0f 1f 40 00              nopl   0x0(%rax)

0000000000000020 <_Z5hcallP1A>:
  20:    48 8b 07                 mov    (%rdi),%rax
  23:    48 63 48 08              movslq 0x8(%rax),%rcx
  27:    48 01 c1                 add    %rax,%rcx
  2a:    ff e1                    jmpq   *%rcx
  2c:    0f 1f 40 00              nopl   0x0(%rax)

0000000000000030 <_Z5icallP1A>:
  30:    48 8b 07                 mov    (%rdi),%rax
  33:    ff 60 10                 jmpq   *0x10(%rax)

Generated code (Thumb-2):

00000000 <_Z5fcallP1A>:
   0:    6801          ldr    r1, [r0, #0]
   2:    680a          ldr    r2, [r1, #0]
   4:    4411          add    r1, r2
   6:    4708          bx    r1

00000008 <_Z5gcallP1A>:
   8:    6801          ldr    r1, [r0, #0]
   a:    684a          ldr    r2, [r1, #4]
   c:    4411          add    r1, r2
   e:    4708          bx    r1

00000010 <_Z5hcallP1A>:
  10:    6801          ldr    r1, [r0, #0]
  12:    688a          ldr    r2, [r1, #8]
  14:    4411          add    r1, r2
  16:    4708          bx    r1

00000018 <_Z5icallP1A>:
  18:    6801          ldr    r1, [r0, #0]
  1a:    68c9          ldr    r1, [r1, #12]
  1c:    4708          bx    r1

Generated code (ARM64):

0000000000000000 <_Z5fcallP1A>:
   0:    f9400008     ldr    x8, [x0]
   4:    b9800109     ldrsw    x9, [x8]
   8:    8b090101     add    x1, x8, x9
   c:    d61f0020     br    x1

0000000000000010 <_Z5gcallP1A>:
  10:    f9400008     ldr    x8, [x0]
  14:    b9800509     ldrsw    x9, [x8,#4]
  18:    8b090101     add    x1, x8, x9
  1c:    d61f0020     br    x1

0000000000000020 <_Z5hcallP1A>:
  20:    f9400008     ldr    x8, [x0]
  24:    b9800909     ldrsw    x9, [x8,#8]
  28:    8b090101     add    x1, x8, x9
  2c:    d61f0020     br    x1

0000000000000030 <_Z5icallP1A>:
  30:    f9400008     ldr    x8, [x0]
  34:    f9400901     ldr    x1, [x8,#16]
  38:    d61f0020     br    x1

So overall code size penalty for call sites is 1 register (on each
architecture) plus 6 bytes (x86-64), 2 bytes (Thumb-2) or 4 bytes (ARM64) of
instructions per call site. Because all byte displacements would be halved
on 64-bit architectures, costs would be smaller for slots 16-31 on x86-64,
as these can now be called using an 8-bit ModR/M displacement, saving 3 bytes.

I think this will most likely be worth it. There would need to be approximately
5 (x86-64), 6 (Thumb-2) or 7 (ARM64) call sites per vtable slot for the
costs in code size to outweigh the benefits. Because each derived class adds
a new set of slots, the benefit would scale linearly with the size of the
class hierarchy.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20160224/6369bd7a/attachment-0001.html>


More information about the llvm-bugs mailing list