[PATCH] D132073: [CodeGen] Zero out only modified registers

Thu Sep 1 15:35:54 PDT 2022

void added a comment.

In D132073#3765433 <https://reviews.llvm.org/D132073#3765433>, @efriedma wrote:

> In D132073#3765379 <https://reviews.llvm.org/D132073#3765379>, @void wrote:
>
>> In D132073#3765159 <https://reviews.llvm.org/D132073#3765159>, @efriedma wrote:
>>
>>>> we shouldn't be zeroing %rdi
>>>
>>> I don't see how it's a problem if we zero rdi.  Even if we don't modify rdi, the caller doesn't have any way of knowing it wasn't modified.  The calling convention says it's caller-save.
>>
>> This is just a test case. It's not meant to be definitive of the original bug. Just show the same symptoms.
>
> The thing is, I don't see the connection between this "symptom" and any possible bug.  rdi is callee-clobbered, and the caller has no way of knowing whether the implementation actually writes to rdi, therefore it can't be a bug to add a write to rdi.

I understand. Here's an example of what's going on, at least with the Linux kernel. A function is defined like this:

  u64 __attribute__((no_instrument_function)) _paravirt_ident_64(u64 x)
  {
          return x;
  }

  /* ... */

  struct paravirt_patch_template pv_ops = {
    /* ... */
    .mmu.pmd_val = ((struct paravirt_callee_save) { _paravirt_ident_64 }),
    /* ... */
  };

The generated code is like this:

          .text
          .globl  _paravirt_ident_64              # -- Begin function _paravirt_ident_64
          .p2align        4, 0x90
          .type   _paravirt_ident_64, at function
  _paravirt_ident_64:                     # @_paravirt_ident_64
  .Lfunc_begin2:
          .loc    0 85 0                          # arch/x86/kernel/paravirt.c:85:0
          .cfi_startproc
  # %bb.0:
          #DEBUG_VALUE: _paravirt_ident_64:x <- $rdi
          movq    %rdi, %rax
  .Ltmp22:
          .loc    0 86 2 prologue_end             # arch/x86/kernel/paravirt.c:86:2
          xorl    %edi, %edi
  .Ltmp23:
          #DEBUG_VALUE: _paravirt_ident_64:x <- $rax
          jmp     __x86_return_thunk              # TAILCALL
  .Ltmp24:
  .Lfunc_end2:
          .size   _paravirt_ident_64, .Lfunc_end2-_paravirt_ident_64
          .cfi_endproc

As you mentioned, this *should* be okay, as `%rdi` is caller-saved. However, when it's used in a call, like this:

  static inline __attribute__((__gnu_inline__)) __attribute__((__unused__))
  __attribute__((no_instrument_function)) pmdval_t
  native_pmd_val(pmd_t pmd)
  {
          return pmd.pmd;
  }       

  static inline __attribute__((__gnu_inline__)) __attribute__((__unused__))
  __attribute__((no_instrument_function)) int
  pmd_none(pmd_t pmd)
  {
          unsigned long val = native_pmd_val(pmd);
          return (val & ~((((pteval_t)(1)) << 6) | (((pteval_t)(1)) << 5))) == 0;
  }

  int pmd_huge(pmd_t pmd)
  {
          return !pmd_none(pmd) && (pmd_val(pmd) & ((((pteval_t)(1)) << 0) |
                                                    (((pteval_t)(1)) << 7))) !=
                                           (((pteval_t)(1)) << 0);
  }                                       

The code generated is something like this:

          .globl  pmd_huge                        # -- Begin function pmd_huge
          .p2align        4, 0x90
          .type   pmd_huge, at function
  pmd_huge:                               # @pmd_huge
  .Lfunc_begin0:
          .loc    0 28 0                          # arch/x86/mm/hugetlbpage.c:28:0
          .cfi_sections .debug_frame
          .cfi_startproc
  # %bb.0:
          callq   __fentry__
  .Ltmp1:
          #DEBUG_VALUE: pmd_huge:pmd <- $rdi
          #DEBUG_VALUE: pmd_none:pmd <- $rdi
          #DEBUG_VALUE: pmd_none:val <- $rdi
          .file   161 "./arch/x86/include/asm" "pgtable.h" md5 0x203fee1b33aab4131e5eb70cbeeb9a3d
          .loc    161 793 41 prologue_end         # ./arch/x86/include/asm/pgtable.h:793:41
          testq   $-97, %rdi
  .Ltmp2:
          .loc    0 29 24                         # arch/x86/mm/hugetlbpage.c:29:24
          je      .LBB0_1
  .Ltmp3:
  # %bb.2:
          #DEBUG_VALUE: pmd_huge:pmd <- $rdi
          #DEBUG_VALUE: pmd_val:pmd <- $rdi
          #DEBUG_VALUE: __edi <- undef
          #DEBUG_VALUE: __esi <- undef
          #DEBUG_VALUE: __edx <- undef
          #DEBUG_VALUE: __ecx <- undef
          #DEBUG_VALUE: __eax <- undef
          .file   162 "./arch/x86/include/asm" "paravirt.h" md5 0x1f771689a20be93009d7ef8574461232
          .loc    162 458 9                       # ./arch/x86/include/asm/paravirt.h:458:9
          #APP
          # ALT: oldnstr
  .Ltmp4:
  .Ltmp5:
  .Ltmp6:
          .section        .discard.retpoline_safe,"", at progbits
          .quad   .Ltmp6
          .text

          callq   *pv_ops+536(%rip)

  .Ltmp7:
          .section        .parainstructions,"a", at progbits
          .p2align        3, 0x0
          .quad   .Ltmp5
          .byte   67
          .byte   .Ltmp7-.Ltmp5
          .short  1
          .text

  .Ltmp8:
          # ALT: padding
          .zero   (-(((.Ltmp9-.Ltmp10)-(.Ltmp8-.Ltmp4))>0))*((.Ltmp9-.Ltmp10)-(.Ltmp8-.Ltmp4)),144
  .Ltmp11:
          .section        .altinstructions,"a", at progbits
  .Ltmp12:
          .long   .Ltmp4-.Ltmp12
  .Ltmp13:
          .long   .Ltmp10-.Ltmp13
          .short  33040
          .byte   .Ltmp11-.Ltmp4
          .byte   .Ltmp9-.Ltmp10
          .text

          .section        .altinstr_replacement,"ax", at progbits
          # ALT: replacement 1
  .Ltmp10:
          movq    %rdi, %rax
  .Ltmp9:
          .text

          #NO_APP
          movq    %rax, %rcx
  .Ltmp14:
          #DEBUG_VALUE: __eax <- $rax
          .loc    0 30 17                         # arch/x86/mm/hugetlbpage.c:30:17
          andl    $129, %ecx
          .loc    0 30 46 is_stmt 0               # arch/x86/mm/hugetlbpage.c:30:46
          xorl    %eax, %eax
  .Ltmp15:
          cmpl    $1, %ecx
          setne   %al
          jmp     .LBB0_3
  .Ltmp16:
  .LBB0_1:
          #DEBUG_VALUE: pmd_huge:pmd <- $rdi
          #DEBUG_VALUE: pmd_none:pmd <- $rdi
          #DEBUG_VALUE: pmd_none:val <- $rdi
          .loc    0 0 46                          # arch/x86/mm/hugetlbpage.c:0:46
          xorl    %eax, %eax
  .Ltmp17:
  .LBB0_3:
          #DEBUG_VALUE: pmd_huge:pmd <- $rdi
          .loc    0 29 2 is_stmt 1                # arch/x86/mm/hugetlbpage.c:29:2
          xorl    %ecx, %ecx
          xorl    %edi, %edi
  .Ltmp18:
          #DEBUG_VALUE: pmd_huge:pmd <- [DW_OP_LLVM_entry_value 1] $rdi
          jmp     __x86_return_thunk              # TAILCALL
  .Ltmp19:
  .Lfunc_end0:
          .size   pmd_huge, .Lfunc_end0-pmd_huge
          .cfi_endproc
                                          # -- End function

The important bit is the `movq %rdi, %rax` right after the `callq *pv_ops+536(%rip)` instruction. The `rdi` register isn't preserved across the call, and is in fact zeroed out so the return value, which was already placed in `rax` by `pmd_val`, is overwritten by zero. My thinking was that we don't want to zero out registers that aren't modified in the called function, because they're not expected to be stomped on. Perhaps there's something else going on?

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D132073/new/

https://reviews.llvm.org/D132073