<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Missed opportunities for register promotion"

   href="https://bugs.llvm.org/show_bug.cgi?id=51193">51193</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Missed opportunities for register promotion

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Windows NT

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Scalar Optimizations

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>momchil.velikov@arm.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Considering this example (<a href="https://gcc.godbolt.org/z/55dfPrc8T">https://gcc.godbolt.org/z/55dfPrc8T</a>), compiled with 

`clang -target aarch64-linux -Ofast`

int u, v;

void f(int a[restrict], int b[restrict], int n) {

    for (int i = 0; i < n; ++i) {

        if (a[i]) {

            ++u;

            break;

        }

        ++u;

        if (b[i])

            ++v;

    }

}

Clang emits 

f:                                      // @f

        cmp     w2, #1                          // =1

        b.lt    .LBB0_7

        adrp    x8, u

        ldr     w10, [x8, :lo12:u]

        mov     w9, w2

        add     w12, w10, #1                    // =1

        adrp    x10, v

        b       .LBB0_3

.LBB0_2:                                //   in Loop: Header=BB0_3 Depth=1

        add     w12, w11, #1                    // =1

        add     x1, x1, #4                      // =4

        subs    x9, x9, #1                      // =1

        add     x0, x0, #4                      // =4

        b.eq    .LBB0_6

.LBB0_3:                                // =>This Inner Loop Header: Depth=1

        ldr     w13, [x0]

        mov     w11, w12

        cbnz    w13, .LBB0_6

        ldr     w12, [x1]

        cbz     w12, .LBB0_2

        ldr     w12, [x10, :lo12:v]

        add     w12, w12, #1                    // =1

        str     w12, [x10, :lo12:v]

        b       .LBB0_2

.LBB0_6:

        str     w11, [x8, :lo12:u]

.LBB0_7:

        ret

where the updates to `u` are performed with a single store instruction after

the loop, in the case where the loop body is executed at least once. In contrast,

updates to `v` are performed on each loop iteration, since moving them outside

the loop may introduce traps or data races.

In the output for the same code, compiled with GCC with `-Ofast` even updates

to `v` are moved out of the loop, since `-Ofast` enables

`-fallow-store-data-races`. But even when compiled with `-O2`, the

updates to `v` are still moved outside the loop, except that the store that

writes to `v` is conditional, i.e.:

f:

        cmp     w2, 0

        ble     .L1

        adrp    x10, .LANCHOR0

        add     x11, x10, :lo12:.LANCHOR0

        mov     x3, 0

        mov     w8, 0

        ldr     w9, [x10, #:lo12:.LANCHOR0]

        mov     w7, 0

        ldr     w6, [x11, 4]

        add     w5, w2, w9

        mov     w2, w9

        b       .L7

.L3:

        ldr     w4, [x1, x3]

        add     w2, w2, 1

        add     x3, x3, 4

        mov     w7, 1

        cbz     w4, .L6

        add     w6, w6, w7

        mov     w8, w7

.L6:

        cmp     w2, w5

        beq     .L22

.L7:

        ldr     w4, [x0, x3]

        cbz     w4, .L3

        cbz     w8, .L4

        str     w6, [x11, 4]

.L4:

        cmp     w7, 0

        csel    w9, w9, w2, eq

        add     w9, w9, 1

        str     w9, [x10, #:lo12:.LANCHOR0]

.L1:

        ret

.L22:

        cbz     w8, .L8

        str     w6, [x11, 4]

.L8:

        str     w2, [x10, #:lo12:.LANCHOR0]

        ret

Clang/LLVM could take advantage of a command line option, function or variable

attribute that allows transformations that could potentially introduce data

races. That sounds reasonable for single-threaded programs or for parts of the

code where the absence of data races is guaranteed at a different level of abstraction.

Even without introducing violations of the C/C++/LLVM memory model, LLVM could

hoist the load, e.g. transform

for ... {

  ...

  if cond {

   x0 = *a

   x1 = x0 + 1

   *a = x1

  }

  ...

}

into

x0 = *a

for ... {

  ...

  x1 = phi(x0, x3)

  if cond {

    x2 = x1 + 1

    *a = x2

  }

  x3 = phi(x1, x2)

  ...

}

if `a` is determined to be dereferenceable.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>