[llvm-bugs] [Bug 51193] New: Missed opportunities for register promotion

Fri Jul 23 12:56:45 PDT 2021

https://bugs.llvm.org/show_bug.cgi?id=51193

            Bug ID: 51193
           Summary: Missed opportunities for register promotion
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Scalar Optimizations
          Assignee: unassignedbugs at nondot.org
          Reporter: momchil.velikov at arm.com
                CC: llvm-bugs at lists.llvm.org

Consider this example (https://gcc.godbolt.org/z/55dfPrc8T), compiled with
`clang -target aarch64-linux -Ofast`:

int u, v;

void f(int a[restrict], int b[restrict], int n) {
    for (int i = 0; i < n; ++i) {
        if (a[i]) {
            ++u;
            break;
        }
        ++u;
        if (b[i])
            ++v;
    }
}

Clang emits 

f:                                      // @f
        cmp     w2, #1                          // =1
        b.lt    .LBB0_7
        adrp    x8, u
        ldr     w10, [x8, :lo12:u]
        mov     w9, w2
        add     w12, w10, #1                    // =1
        adrp    x10, v
        b       .LBB0_3
.LBB0_2:                                //   in Loop: Header=BB0_3 Depth=1
        add     w12, w11, #1                    // =1
        add     x1, x1, #4                      // =4
        subs    x9, x9, #1                      // =1
        add     x0, x0, #4                      // =4
        b.eq    .LBB0_6
.LBB0_3:                                // =>This Inner Loop Header: Depth=1
        ldr     w13, [x0]
        mov     w11, w12
        cbnz    w13, .LBB0_6
        ldr     w12, [x1]
        cbz     w12, .LBB0_2
        ldr     w12, [x10, :lo12:v]
        add     w12, w12, #1                    // =1
        str     w12, [x10, :lo12:v]
        b       .LBB0_2
.LBB0_6:
        str     w11, [x8, :lo12:u]
.LBB0_7:
        ret

where the updates to `u` are performed with a single store instruction after
the loop, in the case where the loop body is executed at least once. In contrast,
updates to `v` are performed on each loop iteration, since moving them outside
the loop may introduce traps or data races.

In the output for the same code compiled with GCC with `-Ofast`, even the updates
to `v` are moved out of the loop, since `-Ofast` enables
`-fallow-store-data-races`. But even when compiled with `-O2`, the
updates to `v` are still moved outside the loop, except that the store that
writes to `v` is conditional, i.e.:

f:
        cmp     w2, 0
        ble     .L1
        adrp    x10, .LANCHOR0
        add     x11, x10, :lo12:.LANCHOR0
        mov     x3, 0
        mov     w8, 0
        ldr     w9, [x10, #:lo12:.LANCHOR0]
        mov     w7, 0
        ldr     w6, [x11, 4]
        add     w5, w2, w9
        mov     w2, w9
        b       .L7
.L3:
        ldr     w4, [x1, x3]
        add     w2, w2, 1
        add     x3, x3, 4
        mov     w7, 1
        cbz     w4, .L6
        add     w6, w6, w7
        mov     w8, w7
.L6:
        cmp     w2, w5
        beq     .L22
.L7:
        ldr     w4, [x0, x3]
        cbz     w4, .L3
        cbz     w8, .L4
        str     w6, [x11, 4]
.L4:
        cmp     w7, 0
        csel    w9, w9, w2, eq
        add     w9, w9, 1
        str     w9, [x10, #:lo12:.LANCHOR0]
.L1:
        ret
.L22:
        cbz     w8, .L8
        str     w6, [x11, 4]
.L8:
        str     w2, [x10, #:lo12:.LANCHOR0]
        ret

Clang/LLVM could take advantage of a command line option, function or variable
attribute that allows transformations that could potentially introduce data
races. That sounds reasonable for single-threaded programs or for parts of the
code where absence of data races is guaranteed at a different level of abstraction.

Even without introducing violations of the C/C++/LLVM memory model, LLVM could
hoist the load, e.g. transform

for ... {
  ...
  if cond {
   x0 = *a
   x1 = x0 + 1
   *a = x1
  }
  ...
}

into

x0 = *a
for ... {
  ...
  x1 = phi(x0, x3)
  if cond {
    x2 = x1 + 1
    *a = x2
  }
  x3 = phi(x1, x2)
  ...
}

if `a` is determined to be dereferenceable.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210723/aa9cc27e/attachment-0001.html>