[PATCH] D88823: [CodeGen][TailDuplicator] Don't duplicate blocks with INLINEASM_BR

Bill Wendling via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 5 20:21:04 PDT 2020


void added a comment.

Here are the relevant parts of the machine code from the original issue, in `get_partial_node`. Blocks `bb.16.bb100` and `bb.17.bb106` are the sources of the values in the PHI node in `bb.18.bb110`. `bb.19.bb17.i.i.i` is the indirect destination of the INLINEASM_BR, and `bb.20.bb113` is its default destination.
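
For context, the INLINEASM_BR comes from the kernel's static-branch `asm goto` pattern. Here's a minimal, illustrative sketch (not the actual kernel source; the function and label names are made up) of how the two destinations map back to the source:

  // Sketch only: the fallthrough after the asm goto is the "default
  // destination" (bb.20.bb113 below), and the label in the goto list is the
  // "indirect destination" (bb.19.bb17.i.i.i below) whose blockaddress shows
  // up in the INLINEASM_BR.
  static inline bool debug_enabled(void) {
    asm goto("# patched jump site" : : : : slowpath);
    return false;   // default destination
  slowpath:
    return true;    // indirect destination
  }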

Before Tail Duplication:

  bb.16.bb100:
  ; predecessors: %bb.15
    successors: %bb.18(0x80000000); %bb.18(100.00%)
  
    MOV64mr %21:gr64, 1, $noreg, 16, $noreg, %7:gr64 :: (store 8 into %ir.17)
    JMP_1 %bb.18
  
  bb.17.bb106:
  ; predecessors: %bb.15
    successors: %bb.18(0x80000000); %bb.18(100.00%)
  
    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
    %56:gr32 = MOV32r0 implicit-def dead $eflags
    $rdi = COPY %19:gr64
    $rsi = COPY %7:gr64
    $edx = COPY %56:gr32
    CALL64pcrel32 @put_cpu_partial, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $edx, implicit-def $rsp, implicit-def $ssp
    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
  
  bb.18.bb110:
  ; predecessors: %bb.17, %bb.16
    successors: %bb.20(0x80000000), %bb.19(0x00000000); %bb.20(100.00%), %bb.19(0.00%)
  
    %16:gr64 = PHI %4:gr64, %bb.17, %13:gr64, %bb.16
    INLINEASM_BR &"1:.byte 0x0f,0x1f,0x44,0x00,0\0A\09.pushsection __jump_table,  \22aw\22 \0A\09 .balign 8 \0A\09.long 1b - ., ${2:l} - . \0A\09 .quad ${0:c} + ${1:c} - .\0A\09.popsection \0A\09" [sideeffect] [mayload] [attdialect], $0:[imm], @slub_debug_enabled, $1:[imm], 0, $2:[imm], blockaddress(@get_partial_node, %ir-block.bb17.i.i.i), $3:[clobber], implicit-def early-clobber $df, $4:[clobber], implicit-def early-clobber $fpsw, $5:[clobber], implicit-def early-clobber $eflags, !11
    JMP_1 %bb.20
  
  bb.19.bb17.i.i.i (address-taken):
  ; predecessors: %bb.18
    successors: %bb.20(0x7c000000), %bb.21(0x04000000); %bb.20(96.88%), %bb.21(3.12%)
  
    TEST32mi %19:gr64, 1, $noreg, 8, $noreg, 2166016, implicit-def $eflags :: (load 4 from %ir.20, align 8)
    JCC_1 %bb.21, 5, implicit $eflags
    JMP_1 %bb.20
  
  bb.20.bb113:
  ; predecessors: %bb.18, %bb.19
    successors: %bb.21(0x04000000), %bb.3(0x7c000000); %bb.21(3.12%), %bb.3(96.88%)
  
    %57:gr32 = MOV32rm %19:gr64, 1, $noreg, 44, $noreg :: (load 4 from %ir.22)
    %58:gr32 = SHR32r1 %57:gr32(tied-def 0), implicit-def dead $eflags
    %59:gr32 = SUB32rr %15:gr32(tied-def 0), killed %58:gr32, implicit-def $eflags
    JCC_1 %bb.3, 6, implicit $eflags
    JMP_1 %bb.21

After Tail Duplication:

  bb.16.bb100:
  ; predecessors: %bb.15
    successors: %bb.20(0x80000000), %bb.19(0x00000000); %bb.20(100.00%), %bb.19(0.00%)
  
    MOV64mr %21:gr64, 1, $noreg, 16, $noreg, %7:gr64 :: (store 8 into %ir.17)
    INLINEASM_BR &"1:.byte 0x0f,0x1f,0x44,0x00,0\0A\09.pushsection __jump_table,  \22aw\22 \0A\09 .balign 8 \0A\09.long 1b - ., ${2:l} - . \0A\09 .quad ${0:c} + ${1:c} - .\0A\09.popsection \0A\09" [sideeffect] [mayload] [attdialect], $0:[imm], @slub_debug_enabled, $1:[imm], 0, $2:[imm], blockaddress(@get_partial_node, %ir-block.bb17.i.i.i), $3:[clobber], implicit-def early-clobber $df, $4:[clobber], implicit-def early-clobber $fpsw, $5:[clobber], implicit-def early-clobber $eflags, !11
    %60:gr64 = COPY %13:gr64
    JMP_1 %bb.20
  
  bb.17.bb106:
  ; predecessors: %bb.15
    successors: %bb.20(0x80000000), %bb.19(0x00000000); %bb.20(100.00%), %bb.19(0.00%)
  
    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
    %56:gr32 = MOV32r0 implicit-def dead $eflags
    $rdi = COPY %19:gr64
    $rsi = COPY %7:gr64
    $edx = COPY %56:gr32
    CALL64pcrel32 @put_cpu_partial, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $edx, implicit-def $rsp, implicit-def $ssp
    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
    INLINEASM_BR &"1:.byte 0x0f,0x1f,0x44,0x00,0\0A\09.pushsection __jump_table,  \22aw\22 \0A\09 .balign 8 \0A\09.long 1b - ., ${2:l} - . \0A\09 .quad ${0:c} + ${1:c} - .\0A\09.popsection \0A\09" [sideeffect] [mayload] [attdialect], $0:[imm], @slub_debug_enabled, $1:[imm], 0, $2:[imm], blockaddress(@get_partial_node, %ir-block.bb17.i.i.i), $3:[clobber], implicit-def early-clobber $df, $4:[clobber], implicit-def early-clobber $fpsw, $5:[clobber], implicit-def early-clobber $eflags, !11
    %61:gr64 = COPY %4:gr64
    JMP_1 %bb.20
  
  bb.19.bb17.i.i.i (address-taken):
  ; predecessors: %bb.16, %bb.17
    successors: %bb.20(0x7c000000), %bb.21(0x04000000); %bb.20(96.88%), %bb.21(3.12%)
  
    %62:gr64 = PHI %60:gr64, %bb.16, %61:gr64, %bb.17
    TEST32mi %19:gr64, 1, $noreg, 8, $noreg, 2166016, implicit-def $eflags :: (load 4 from %ir.20, align 8)
    JCC_1 %bb.21, 5, implicit $eflags
    JMP_1 %bb.20
  
  bb.20.bb113:
  ; predecessors: %bb.19, %bb.16, %bb.17
    successors: %bb.21(0x04000000), %bb.3(0x7c000000); %bb.21(3.12%), %bb.3(96.88%)
  
    %63:gr64 = PHI %62:gr64, %bb.19, %60:gr64, %bb.16, %61:gr64, %bb.17
    %57:gr32 = MOV32rm %19:gr64, 1, $noreg, 44, $noreg :: (load 4 from %ir.22)
    %58:gr32 = SHR32r1 %57:gr32(tied-def 0), implicit-def dead $eflags
    %59:gr32 = SUB32rr %15:gr32(tied-def 0), killed %58:gr32, implicit-def $eflags
    JCC_1 %bb.3, 6, implicit $eflags
    JMP_1 %bb.21

You'll notice that the INLINEASM_BR was duplicated into both of its predecessors, and that `bb.17.bb106` was merged with `bb.18.bb110` (the block that originally contained the INLINEASM_BR).

Here's the code after PHI elimination:

  bb.16.bb94:
  ; predecessors: %bb.14
    successors: %bb.17(0x30000000), %bb.18(0x50000000); %bb.17(37.50%), %bb.18(62.50%)
  
    TEST64rr %4:gr64, %4:gr64, implicit-def $eflags
    JCC_1 %bb.18, 5, implicit killed $eflags
    JMP_1 %bb.17
  
  bb.17.bb100:
  ; predecessors: %bb.16
    successors: %bb.20(0x80000000), %bb.19(0x00000000); %bb.20(100.00%), %bb.19(0.00%)
  
    MOV64mr %21:gr64, 1, $noreg, 16, $noreg, killed %7:gr64 :: (store 8 into %ir.17)
    INLINEASM_BR &"1:.byte 0x0f,0x1f,0x44,0x00,0\0A\09.pushsection __jump_table,  \22aw\22 \0A\09 .balign 8 \0A\09.long 1b - ., ${2:l} - . \0A\09 .quad ${0:c} + ${1:c} - .\0A\09.popsection \0A\09" [sideeffect] [mayload] [attdialect], $0:[imm], @slub_debug_enabled, $1:[imm], 0, $2:[imm], blockaddress(@get_partial_node, %ir-block.bb17.i.i.i), $3:[clobber], implicit-def dead early-clobber $df, $4:[clobber], implicit-def early-clobber $fpsw, $5:[clobber], implicit-def dead early-clobber $eflags, !11
    %60:gr64 = COPY killed %13:gr64
    %69:gr64 = COPY %60:gr64
    %70:gr64 = COPY killed %60:gr64
    JMP_1 %bb.20
  
  bb.18.bb106:
  ; predecessors: %bb.16
    successors: %bb.20(0x80000000), %bb.19(0x00000000); %bb.20(100.00%), %bb.19(0.00%)
  
    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
    $rdi = COPY %19:gr64
    $rsi = COPY killed %7:gr64
    $edx = COPY %26:gr32
    CALL64pcrel32 @put_cpu_partial, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit killed $rsi, implicit killed $edx, implicit-def $rsp, implicit-def $ssp
    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
    INLINEASM_BR &"1:.byte 0x0f,0x1f,0x44,0x00,0\0A\09.pushsection __jump_table,  \22aw\22 \0A\09 .balign 8 \0A\09.long 1b - ., ${2:l} - . \0A\09 .quad ${0:c} + ${1:c} - .\0A\09.popsection \0A\09" [sideeffect] [mayload] [attdialect], $0:[imm], @slub_debug_enabled, $1:[imm], 0, $2:[imm], blockaddress(@get_partial_node, %ir-block.bb17.i.i.i), $3:[clobber], implicit-def dead early-clobber $df, $4:[clobber], implicit-def early-clobber $fpsw, $5:[clobber], implicit-def dead early-clobber $eflags, !11
    %61:gr64 = COPY killed %4:gr64
    %69:gr64 = COPY %61:gr64
    %70:gr64 = COPY killed %61:gr64
    JMP_1 %bb.20
  
  bb.19.bb17.i.i.i (address-taken):
  ; predecessors: %bb.17, %bb.18
    successors: %bb.20(0x7c000000), %bb.21(0x04000000); %bb.20(96.88%), %bb.21(3.12%)
  
    %62:gr64 = COPY killed %69:gr64
    TEST32mi %19:gr64, 1, $noreg, 8, $noreg, 2166016, implicit-def $eflags :: (load 4 from %ir.20, align 8)
    %70:gr64 = COPY %62:gr64
    %71:gr64 = COPY killed %62:gr64
    JCC_1 %bb.21, 5, implicit killed $eflags
    JMP_1 %bb.20
  
  bb.20.bb113:
  ; predecessors: %bb.19, %bb.17, %bb.18
    successors: %bb.21(0x04000000), %bb.5(0x7c000000); %bb.21(3.12%), %bb.5(96.88%)
  
    %63:gr64 = COPY killed %70:gr64
    %42:gr16 = COPY killed %37.sub_16bit:gr32
    %43:gr32 = MOVZX32rr16 killed %42:gr16
    %14:gr32 = nsw SUB32rr killed %41:gr32(tied-def 0), killed %43:gr32, implicit-def dead $eflags
    %15:gr32 = ADD32rr killed %3:gr32(tied-def 0), killed %14:gr32, implicit-def dead $eflags
    %57:gr32 = MOV32rm %19:gr64, 1, $noreg, 44, $noreg :: (load 4 from %ir.22)
    %58:gr32 = SHR32r1 killed %57:gr32(tied-def 0), implicit-def dead $eflags
    CMP32rr %15:gr32, killed %58:gr32, implicit-def $eflags
    %64:gr32 = COPY killed %15:gr32
    %65:gr64 = COPY %63:gr64
    %66:gr64 = COPY killed %8:gr64
    %71:gr64 = COPY killed %63:gr64
    JCC_1 %bb.5, 6, implicit killed $eflags
    JMP_1 %bb.21

As you can see, PHI elimination placed the copy instructions in the wrong places. As I wrote to James, we shouldn't be trying to shove those copies in before the INLINEASM_BR, because that's contrary to how we normally resolve PHI nodes: a copy that resolves a PHI in a successor should execute *only* when the edge to that successor is taken. The back end relies on this. So if we allow PHI resolution along the indirect edge, and the resulting copies end up placed before the asm goto, we violate that assumption. Not only is the code slower, it may also execute incorrectly.
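
FWIW, the fix here is simply to refuse to tail-duplicate such blocks. A minimal sketch of that idea (illustrative only, not the literal diff; the helper name and its exact placement in the legality check are assumptions):

  // Illustrative sketch: bail out of tail duplication for any block that
  // contains an INLINEASM_BR, so PHI elimination never has to place copies
  // around a duplicated asm goto.
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstr.h"
  #include "llvm/CodeGen/TargetOpcodes.h"
  using namespace llvm;

  static bool hasInlineAsmBr(const MachineBasicBlock &MBB) {
    for (const MachineInstr &MI : MBB)
      if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
        return true;
    return false;
  }

  // The duplication legality check (e.g. TailDuplicator::shouldTailDuplicate)
  // could then reject such blocks up front:
  //   if (hasInlineAsmBr(TailBB))
  //     return false;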


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D88823/new/

https://reviews.llvm.org/D88823


