[PATCH] D150388: [CodeGen]Allow targets to use target specific COPY instructions for live range splitting

Tue Jul 18 15:45:40 PDT 2023

alexfh added a comment.

I'm not sure this llvm-reduce'd IR snippet retains the essence of the problem, but maybe you could look at it and see if there's an obvious issue with the codegen?

  ; Target configuration for the reproducer (x86-64 Linux, per the triple below).
  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-linux-gnu"

  ; Function Attrs: cold noreturn nounwind
  ; Trap intrinsic; @_f calls it with the immediate 0 on its %6-true path.
  declare void @llvm.ubsantrap(i8 immarg) #0

  ; The four callees below are declared only; their bodies are not part of this
  ; reproducer, so nothing is known here about what they do.
  declare i1 @_f1()

  declare { i64, i64 } @_f2(ptr)

  declare { i64, i8 } @_f3()

  declare void @_f4()

  ; llvm-reduce'd reproducer function. Uses the fastcc calling convention and
  ; attribute group #1. The unnamed entry block is %9.
  define fastcc void @_f(ptr %0, ptr %1, i64 %2, ptr %3, ptr %4, i1 %5, i1 %6, i24 %7, i1 %8) #1 {
    ; The result %10 is never used; only the call itself remains.
    %10 = call i1 @_f1()
    ; Enter the loop at %13 only when the i24 argument %7 is zero.
    %11 = icmp eq i24 %7, 0
    br i1 %11, label %13, label %12

  ; %7 != 0: call @_f4 and return.
  12:                                               ; preds = %9
    call void @_f4()
    br label %common.ret

  ; Single shared return block.
  common.ret:                                       ; preds = %22, %18, %12
    ret void

  ; Loop header. %14 is undef on first entry from %9 and carries %21 around the
  ; back edge from %20.
  13:                                               ; preds = %20, %9
    %14 = phi i40 [ undef, %9 ], [ %21, %20 ]
    br i1 %6, label %15, label %16

  ; %6 is true: trap, never returns.
  15:                                               ; preds = %13
    call void @llvm.ubsantrap(i8 0)
    unreachable

  ; %6 is false: call @_f2 on %3 (result %17 is unused), then branch on %5.
  16:                                               ; preds = %13
    %17 = call { i64, i64 } @_f2(ptr %3)
    br i1 %5, label %20, label %18

  ; %5 is false: keep only the low 32 bits of %14, store null through %0 and
  ; the masked i40 through %1, then return if %8 is true.
  18:                                               ; preds = %16
    %19 = and i40 %14, 4294967295
    store ptr null, ptr %0, align 8
    store i40 %19, ptr %1, align 4
    br i1 %8, label %common.ret, label %20

  ; Loop latch: %21 is the unmasked %14 (from %16) or the masked %19 (from %18).
  ; Leaves the loop to %22 when %8 is true, otherwise iterates again.
  20:                                               ; preds = %18, %16
    %21 = phi i40 [ %14, %16 ], [ %19, %18 ]
    br i1 %8, label %22, label %13

  ; Loop exit: store null through %0, call @_f3 (result %23 is unused), reload
  ; the pointer from %0, and test whether it is still null.
  22:                                               ; preds = %20
    store ptr null, ptr %0, align 8
    %23 = call { i64, i8 } @_f3()
    %24 = load ptr, ptr %0, align 8
    %25 = icmp eq ptr %24, null
    br i1 %25, label %26, label %common.ret

  ; Reloaded pointer was null: volatile store of 0 through a null pointer,
  ; followed by unreachable.
  26:                                               ; preds = %22
    store volatile i32 0, ptr null, align 4294967296
    unreachable
  }

  ; Attribute group #0 is referenced by the @llvm.ubsantrap declaration;
  ; group #1 is referenced by the definition of @_f.
  attributes #0 = { cold noreturn nounwind }
  attributes #1 = { "frame-pointer"="all" }

The difference in the generated x86-64 assembly (with `clang -O1 -S`, before this patch and after it) is as follows:

          .text
          .file   "reduced.ll"
          .globl  _f                              # -- Begin function _f
          .p2align        4, 0x90
          .type   _f,@function
   _f:                                     # @_f
          .cfi_startproc
   # %bb.0:
          pushq   %rbp
          .cfi_def_cfa_offset 16
          .cfi_offset %rbp, -16
          movq    %rsp, %rbp
          .cfi_def_cfa_register %rbp
          pushq   %r15
          pushq   %r14
          pushq   %r13
          pushq   %r12
          pushq   %rbx
          subq    $24, %rsp
          .cfi_offset %rbx, -56
          .cfi_offset %r12, -48
          .cfi_offset %r13, -40
          .cfi_offset %r14, -32
          .cfi_offset %r15, -24
          movl    %r9d, %r14d
  -       movq    %rcx, %r12
  -       movq    %rsi, %r15
  -       movq    %rdi, -48(%rbp)                 # 8-byte Spill
  +       movq    %rcx, %r15
  +       movq    %rsi, %r12
  +       movq    %rdi, %rbx
          movzbl  32(%rbp), %r13d
  -       movzbl  16(%rbp), %ebx
  +       movzbl  16(%rbp), %eax
  +       movb    %al, -41(%rbp)                  # 1-byte Spill
          callq   _f1@PLT
          testl   $16777215, 24(%rbp)             # imm = 0xFFFFFF
          je      .LBB0_1
  -# %bb.5:
  +# %bb.9:
          addq    $24, %rsp
          popq    %rbx
          popq    %r12
          popq    %r13
          popq    %r14
          popq    %r15
          popq    %rbp
          .cfi_def_cfa %rsp, 8
          jmp     _f4@PLT                         # TAILCALL
   .LBB0_1:                                # %.preheader
          .cfi_def_cfa %rbp, 16
  -       movq    %r15, -56(%rbp)                 # 8-byte Spill
  -       movl    %r14d, %r15d
  -       movq    -48(%rbp), %r14                 # 8-byte Reload
  -       testb   $1, %bl
  -       jne     .LBB0_7
  +       movq    %rbx, -56(%rbp)                 # 8-byte Spill
  +       testb   $1, -41(%rbp)                   # 1-byte Folded Reload
  +       jne     .LBB0_11
   # %bb.2:                                # %.preheader.split.preheader
                                           # implicit-def: $rbx
          jmp     .LBB0_3
          .p2align        4, 0x90
  -.LBB0_4:                                #   in Loop: Header=BB0_3 Depth=1
  -       movq    %r14, %rax
  +.LBB0_6:                                #   in Loop: Header=BB0_3 Depth=1
          testb   $1, %r13b
  -       jne     .LBB0_11
  +       jne     .LBB0_7
   .LBB0_3:                                # %.preheader.split
                                           # =>This Inner Loop Header: Depth=1
  -       movq    %r12, %rdi
  +       movq    %r15, %rdi
          callq   _f2@PLT
  -       testb   $1, %r15b
  -       jne     .LBB0_4
  -# %bb.8:                                #   in Loop: Header=BB0_3 Depth=1
  -       movq    $0, (%r14)
  -       movq    -56(%rbp), %rcx                 # 8-byte Reload
  -       movl    %ebx, (%rcx)
  -       movb    $0, 4(%rcx)
  -       testb   $1, %r13b
  +       testb   $1, %r14b
          jne     .LBB0_6
  -# %bb.9:                                #   in Loop: Header=BB0_3 Depth=1
  -       movq    %r14, %rax
  -       movl    %ebx, %ebx
  -       testb   $1, %r13b
  -       je      .LBB0_3
  -.LBB0_11:
  +# %bb.4:                                #   in Loop: Header=BB0_3 Depth=1
  +       movq    -56(%rbp), %rax                 # 8-byte Reload
          movq    $0, (%rax)
  -       movq    %rax, %rbx
  +       movl    %ebx, (%r12)
  +       movb    $0, 4(%r12)
  +       testb   $1, %r13b
  +       jne     .LBB0_10
  +# %bb.5:                                #   in Loop: Header=BB0_3 Depth=1
  +       movl    %ebx, %ebx
  +       jmp     .LBB0_6
  +.LBB0_7:
  +       movq    -56(%rbp), %rbx                 # 8-byte Reload
  +       movq    $0, (%rbx)
          callq   _f3@PLT
          cmpq    $0, (%rbx)
  -       je      .LBB0_12
  -.LBB0_6:                                # %common.ret
  +       je      .LBB0_8
  +.LBB0_10:                               # %common.ret
          addq    $24, %rsp
          popq    %rbx
          popq    %r12
          popq    %r13
          popq    %r14
          popq    %r15
          popq    %rbp
          .cfi_def_cfa %rsp, 8
          retq
  -.LBB0_7:
  +.LBB0_11:
          .cfi_def_cfa %rbp, 16
          ud1l    (%eax), %eax
  -.LBB0_12:
  +.LBB0_8:
          movl    $0, 0
   .Lfunc_end0:
          .size   _f, .Lfunc_end0-_f
          .cfi_endproc
                                           # -- End function
          .section        ".note.GNU-stack","",@progbits
          .addrsig

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D150388/new/

https://reviews.llvm.org/D150388