[llvm-bugs] [Bug 36993] New: Opportunity to reduce branch usage by merging conditional checks

Tue Apr 3 13:12:23 PDT 2018

https://bugs.llvm.org/show_bug.cgi?id=36993

            Bug ID: 36993
           Summary: Opportunity to reduce branch usage by merging
                    conditional checks
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: listmail at philipreames.com
                CC: llvm-bugs at lists.llvm.org

I noticed that if we have two adjacent conditions of the form typical for
optimized range checks (a lt test between two known positive numbers), we can
rewrite the test by rewriting in forms of a subtract/and idiom.  The basic idea
is that we know a LT test fails for two positive numbers exactly when the
result of (RHS-LHS) is less than zero.  (We know the subtract can't wrap
through INT_MIN since both are positive.)  Beyond LT tests, this can also be
used to merge adjacent null tests as well (since for non-kernel pointers, both
are positive).

The net result is that we can use one fewer branch instruction to perform the
pair of comparisons at the cost of an extra live register through the sub/and
sequence.  The tradeoff here is not obvious, but this may be a useful idiom for
reducing pressure on branch resources.  

The IR below was hand massaged to produce the assembly I think we should be
emitting here.  "test" if our current default.  "alt" is the proposed
alternative.

define i1 @test(i32* %p0) {
entry:
  %p1 = getelementptr i32, i32* %p0, i32 1
  %p2 = getelementptr i32, i32* %p0, i32 2
  %p3 = getelementptr i32, i32* %p0, i32 3
  %v0 = load i32, i32* %p0, !range !{ i32 0, i32 1024 }
  %v1 = load i32, i32* %p1, !range !{ i32 0, i32 1024 }
  %v2 = load i32, i32* %p2, !range !{ i32 0, i32 1024 }
  %v3 = load i32, i32* %p3, !range !{ i32 0, i32 1024 }
  %chk1 = icmp ult i32 %v0, %v1
  %chk2 = icmp ult i32 %v2, %v3
  %and = and i1 %chk1, %chk2
  ret i1 %and
}

define i1 @alt(i32* %p0) {
entry:
  %p1 = getelementptr i32, i32* %p0, i32 1
  %p2 = getelementptr i32, i32* %p0, i32 2
  %p3 = getelementptr i32, i32* %p0, i32 3
  %v0 = load i32, i32* %p0, !range !{ i32 0, i32 1024 }
  %v1 = load i32, i32* %p1, !range !{ i32 0, i32 1024 }
  %v2 = load i32, i32* %p2, !range !{ i32 0, i32 1024 }
  %v3 = load i32, i32* %p3, !range !{ i32 0, i32 1024 }
  %sub1 = sub i32 %v1, %v0
  %sub2 = sub i32 %v3, %v2
  %and = and i32 %sub1, %sub2
  %cmp = icmp slt i32 %and, 0
  ret i1 %cmp
}

declare void @foo()

define void @test_br(i32* %p0) {
entry:
  %p1 = getelementptr i32, i32* %p0, i32 1
  %p2 = getelementptr i32, i32* %p0, i32 2
  %p3 = getelementptr i32, i32* %p0, i32 3
  %v0 = load i32, i32* %p0, !range !{ i32 0, i32 1024 }
  %v1 = load i32, i32* %p1, !range !{ i32 0, i32 1024 }
  %v2 = load i32, i32* %p2, !range !{ i32 0, i32 1024 }
  %v3 = load i32, i32* %p3, !range !{ i32 0, i32 1024 }
  %chk1 = icmp ult i32 %v0, %v1
  %chk2 = icmp ult i32 %v2, %v3
  %and = and i1 %chk1, %chk2
  br i1 %and, label %taken, label %merge
taken:
  call void @foo()
  br label %merge
merge:
  ret void
}

define void @alt_br(i32* %p0) {
entry:
  %p1 = getelementptr i32, i32* %p0, i32 1
  %p2 = getelementptr i32, i32* %p0, i32 2
  %p3 = getelementptr i32, i32* %p0, i32 3
  %v0 = load i32, i32* %p0, !range !{ i32 0, i32 1024 }
  %v1 = load i32, i32* %p1, !range !{ i32 0, i32 1024 }
  %v2 = load i32, i32* %p2, !range !{ i32 0, i32 1024 }
  %v3 = load i32, i32* %p3, !range !{ i32 0, i32 1024 }
  %sub1 = sub i32 %v1, %v0
  %sub2 = sub i32 %v3, %v2
  %and = and i32 %sub1, %sub2
  %cmp = icmp slt i32 %and, 0
  br i1 %cmp, label %taken, label %merge
taken:
  call void @foo()
  br label %merge
merge:
  ret void
}

Output (haswell, but pretty similar on all micros tried)

        .text
        .file   "<stdin>"
        .globl  test                    # -- Begin function test
        .p2align        4, 0x90
        .type   test, at function
test:                                   # @test
        .cfi_startproc
# %bb.0:                                # %entry
        movl    (%rdi), %eax
        movl    8(%rdi), %ecx
        cmpl    4(%rdi), %eax
        setb    %dl
        cmpl    12(%rdi), %ecx
        setb    %al
        andb    %dl, %al
        retq
.Lfunc_end0:
        .size   test, .Lfunc_end0-test
        .cfi_endproc
                                        # -- End function
        .globl  alt                     # -- Begin function alt
        .p2align        4, 0x90
        .type   alt, at function
alt:                                    # @alt
        .cfi_startproc
# %bb.0:                                # %entry
        movl    4(%rdi), %ecx
        movl    12(%rdi), %eax
        subl    (%rdi), %ecx
        subl    8(%rdi), %eax
        andl    %ecx, %eax
        shrl    $31, %eax
                                        # kill: def $al killed $al killed $eax
        retq
.Lfunc_end1:
        .size   alt, .Lfunc_end1-alt
        .cfi_endproc
                                        # -- End function
        .globl  test_br                 # -- Begin function test_br
        .p2align        4, 0x90
        .type   test_br, at function
test_br:                                # @test_br
        .cfi_startproc
# %bb.0:                                # %entry
        pushq   %rax
        .cfi_def_cfa_offset 16
        movl    (%rdi), %eax
        cmpl    4(%rdi), %eax
        jae     .LBB2_3
# %bb.1:                                # %entry
        movl    12(%rdi), %eax
        cmpl    %eax, 8(%rdi)
        jae     .LBB2_3
# %bb.2:                                # %taken
        callq   foo
.LBB2_3:                                # %merge
        popq    %rax
        retq
.Lfunc_end2:
        .size   test_br, .Lfunc_end2-test_br
        .cfi_endproc
                                        # -- End function
        .globl  alt_br                  # -- Begin function alt_br
        .p2align        4, 0x90
        .type   alt_br, at function
alt_br:                                 # @alt_br
        .cfi_startproc
# %bb.0:                                # %entry
        pushq   %rax
        .cfi_def_cfa_offset 16
        movl    4(%rdi), %eax
        subl    (%rdi), %eax
        movl    12(%rdi), %ecx
        subl    8(%rdi), %ecx
        testl   %ecx, %eax
        js      .LBB3_1
# %bb.2:                                # %merge
        popq    %rax
        retq
.LBB3_1:                                # %taken
        callq   foo
        popq    %rax
        retq
.Lfunc_end3:
        .size   alt_br, .Lfunc_end3-alt_br
        .cfi_endproc
                                        # -- End function

        .section        ".note.GNU-stack","", at progbits

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20180403/f1fc3474/attachment-0001.html>