<html>
<head>
<base href="https://llvm.org/bugs/" />
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW --- - Improve code size with a more space-efficient vtable ABI"
href="https://llvm.org/bugs/show_bug.cgi?id=26723">26723</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Improve code size with a more space-efficient vtable ABI
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>All
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>normal
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Interprocedural Optimizations
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>peter@pcc.me.uk
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr>
<tr>
<th>Classification</th>
<td>Unclassified
</td>
</tr></table>
<div>
<pre>A more space-efficient representation for virtual tables would store 32-bit
address-point-relative offsets to virtual functions instead of virtual
function pointers. By using this representation, we avoid the need to emit
dynamic relocations for the virtual function pointers. On ELF, taking into
account the size of the dynamic relocation entry, this would save 32-4=28
bytes per virtual table slot on 64-bit architectures, or 16-4=12 bytes per
slot on 32-bit architectures.
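
As a rough, hypothetical sketch of the per-slot arithmetic (the struct names
below are illustrative only, not part of any proposed ABI), compare a
three-slot table of function pointers with a three-slot table of 32-bit
offsets:

#include <cstdint>
#include <cstdio>

// Classic layout: one pointer per virtual function.
struct PtrVtable3 { void (*f)(); void (*g)(); void (*h)(); };

// Proposed layout: one 32-bit address-point-relative offset per function.
struct RelVtable3 { int32_t f_offset, g_offset, h_offset; };

int main() {
  // On an LP64 target this prints 24 vs. 12; in a PIC/PIE image the
  // pointer version additionally needs one dynamic relocation per slot,
  // while the offsets are link-time constants.
  std::printf("pointer slots: %zu bytes, offset slots: %zu bytes\n",
              sizeof(PtrVtable3), sizeof(RelVtable3));
}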

This is an ABI break, but we can practically deploy it when we have
whole-program visibility of the class hierarchy, which can be ensured at LTO
time using something along the lines of the bitset metadata currently
produced by -fwhole-program-vtables.

Because the dynamic linker would no longer have to apply those relocations,
this would also save the program some startup time. And when RTTI is
disabled, there would be no need to store any dynamically relocated pointers
in the vtable region at all, so the vtable pages could be shared among
processes.

See also PR24782. I suspect that vtables are writable on Mac because of
these dynamic relocations.

Code generation example:

#include <stdint.h>

struct A;

// f, g and h are dispatched through 32-bit offsets relative to the
// vtable's address point; i is kept as a conventional function-pointer
// slot so the two dispatch sequences can be compared below.
struct A_vtable {
  int32_t f_offset;
  int32_t g_offset;
  int32_t h_offset;
  void (*i)(A *);
};

struct A {
  A_vtable *vtable;
};

void fcall(A *a) {
  auto vt = a->vtable;
  auto fp = reinterpret_cast<void (*)(A *)>(reinterpret_cast<char *>(vt) +
                                            vt->f_offset);
  fp(a);
}

void gcall(A *a) {
  auto vt = a->vtable;
  auto fp = reinterpret_cast<void (*)(A *)>(reinterpret_cast<char *>(vt) +
                                            vt->g_offset);
  fp(a);
}

void hcall(A *a) {
  auto vt = a->vtable;
  auto fp = reinterpret_cast<void (*)(A *)>(reinterpret_cast<char *>(vt) +
                                            vt->h_offset);
  fp(a);
}

void icall(A *a) {
  a->vtable->i(a);
}
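
The offsets above cannot be written as portable C++ constant expressions, so
as a minimal, hypothetical way to exercise the functions in this example
(a_impl, i_impl and init_vtable are made-up names, and the cast from a
function pointer to char * is only conditionally supported, though common
compilers accept it), the table could be filled in at startup. The generated
code shown next is for the dispatch functions above, not for this driver:

#include <cstdio>

static void a_impl(A *) { std::puts("called via offset slot"); }
static void i_impl(A *) { std::puts("called via pointer slot"); }

static A_vtable vt;  // offsets filled in by init_vtable()

static void init_vtable() {
  // In a real implementation these offsets would be link-time constants;
  // here they are computed at startup purely for illustration.
  char *base = reinterpret_cast<char *>(&vt);
  int32_t off = static_cast<int32_t>(reinterpret_cast<char *>(&a_impl) - base);
  vt.f_offset = off;
  vt.g_offset = off;  // reuse the same implementation for all three slots
  vt.h_offset = off;
  vt.i = i_impl;
}

int main() {
  init_vtable();
  A a{&vt};
  fcall(&a);  // dispatches through f_offset
  icall(&a);  // dispatches through the plain pointer slot
}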

Generated code (x86-64):

0000000000000000 <_Z5fcallP1A>:
   0:   48 8b 07                mov    (%rdi),%rax
   3:   48 63 08                movslq (%rax),%rcx
   6:   48 01 c1                add    %rax,%rcx
   9:   ff e1                   jmpq   *%rcx
   b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)

0000000000000010 <_Z5gcallP1A>:
  10:   48 8b 07                mov    (%rdi),%rax
  13:   48 63 48 04             movslq 0x4(%rax),%rcx
  17:   48 01 c1                add    %rax,%rcx
  1a:   ff e1                   jmpq   *%rcx
  1c:   0f 1f 40 00             nopl   0x0(%rax)

0000000000000020 <_Z5hcallP1A>:
  20:   48 8b 07                mov    (%rdi),%rax
  23:   48 63 48 08             movslq 0x8(%rax),%rcx
  27:   48 01 c1                add    %rax,%rcx
  2a:   ff e1                   jmpq   *%rcx
  2c:   0f 1f 40 00             nopl   0x0(%rax)

0000000000000030 <_Z5icallP1A>:
  30:   48 8b 07                mov    (%rdi),%rax
  33:   ff 60 10                jmpq   *0x10(%rax)

Generated code (Thumb-2):

00000000 <_Z5fcallP1A>:
   0:   6801        ldr    r1, [r0, #0]
   2:   680a        ldr    r2, [r1, #0]
   4:   4411        add    r1, r2
   6:   4708        bx     r1

00000008 <_Z5gcallP1A>:
   8:   6801        ldr    r1, [r0, #0]
   a:   684a        ldr    r2, [r1, #4]
   c:   4411        add    r1, r2
   e:   4708        bx     r1

00000010 <_Z5hcallP1A>:
  10:   6801        ldr    r1, [r0, #0]
  12:   688a        ldr    r2, [r1, #8]
  14:   4411        add    r1, r2
  16:   4708        bx     r1

00000018 <_Z5icallP1A>:
  18:   6801        ldr    r1, [r0, #0]
  1a:   68c9        ldr    r1, [r1, #12]
  1c:   4708        bx     r1

Generated code (ARM64):

0000000000000000 <_Z5fcallP1A>:
   0:   f9400008    ldr    x8, [x0]
   4:   b9800109    ldrsw  x9, [x8]
   8:   8b090101    add    x1, x8, x9
   c:   d61f0020    br     x1

0000000000000010 <_Z5gcallP1A>:
  10:   f9400008    ldr    x8, [x0]
  14:   b9800509    ldrsw  x9, [x8,#4]
  18:   8b090101    add    x1, x8, x9
  1c:   d61f0020    br     x1

0000000000000020 <_Z5hcallP1A>:
  20:   f9400008    ldr    x8, [x0]
  24:   b9800909    ldrsw  x9, [x8,#8]
  28:   8b090101    add    x1, x8, x9
  2c:   d61f0020    br     x1

0000000000000030 <_Z5icallP1A>:
  30:   f9400008    ldr    x8, [x0]
  34:   f9400901    ldr    x1, [x8,#16]
  38:   d61f0020    br     x1

So the overall code-size penalty at each call site is one extra register (on
each architecture) plus 6 bytes (x86-64), 2 bytes (Thumb-2) or 4 bytes
(ARM64) of instructions. Because vtable slots shrink from 8 bytes to 4, all
byte displacements into the vtable would be halved on 64-bit architectures,
so the cost would be smaller for slots 16-31 on x86-64: their displacements
(64 through 124) fit in an 8-bit ModR/M displacement rather than a 32-bit
one, saving 3 bytes per call site.

I think this will most likely be worth it. Dividing the per-slot savings
above by the per-call-site costs, there would need to be approximately 5
(x86-64: 28/6), 6 (Thumb-2: 12/2) or 7 (ARM64: 28/4) call sites per vtable
slot before the costs in code size outweigh the benefits. And because each
derived class adds a new set of slots, the benefit would scale linearly with
the size of the class hierarchy.</pre>
</div>
</body>
</html>