<html>

    <head>

      <base href="https://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - [ppc] slow data reorganization in VSX register (through memory)"

   href="https://llvm.org/bugs/show_bug.cgi?id=27078">27078</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>[ppc] slow data reorganization in VSX register (through memory)

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: PowerPC

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>carrot@google.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Compile the following code with options:

 -mvsx -mcpu=power8 -g0 -O2

typedef float Vector3_f[3];

void foo(Vector3_f* blurred_row, int width, float* pixel, float pixel_diff_avg)

{

    for (int j = 0; j < width; ++j, pixel += 3) {

      float* blurred_pixel = blurred_row[j];

      float pixel_diff[3];

      pixel_diff[0] = blurred_pixel[0] - pixel[0];

      pixel_diff[1] = blurred_pixel[1] - pixel[1];

      pixel_diff[2] = blurred_pixel[2] - pixel[2];

      pixel_diff[0] -= pixel_diff_avg;

      pixel_diff[1] -= pixel_diff_avg;

      pixel_diff[2] -= pixel_diff_avg;

      pixel[0] += pixel_diff[0];

      pixel[1] += pixel_diff[1];

      pixel[2] += pixel_diff[2];

    }   

}

LLVM tries to vectorize the loop body and generates the following code:

        ...

        stxsspx 36, 0, 25            // A1

        xscvspdpn 8, 8

        addi 25, 31, 308 

        xxsldwi 39, 2, 2, 3

        xscvspdpn 10, 10

        ld 28, 88(31)                   # 8-byte Folded Reload

        xxsldwi 38, 0, 0, 1

        xscvspdpn 12, 12

        stxsspx 7, 0, 28             // A2

        xxsldwi 2, 2, 2, 2

        xscvspdpn 4, 39

        ld 28, 80(31)                   # 8-byte Folded Reload

        xxsldwi 3, 3, 3, 1

        xscvspdpn 13, 38

        xxsldwi 42, 0, 0, 3

        stxsspx 9, 0, 28              // A3

        stxsspx 11, 0, 26             // A4

        xscvspdpn 0, 0

        lxvd2x 7, 0, 26               // A5

        ld 24, 72(31)                   # 8-byte Folded Reload

        xscvspdpn 2, 2

        xscvspdpn 6, 42

        stxsspx 32, 0, 24             // B1

        xscvspdpn 3, 3

        addi 24, 31, 304 

        ld 28, 64(31)                   # 8-byte Folded Reload

        xxswapd  11, 7

        stxsspx 35, 0, 28             // B2

        stxsspx 37, 0, 23             // B3

        addi 28, 31, 224 

        stxsspx 33, 0, 22             // B4

        ori 2, 2, 0

        lxvd2x 9, 0, 22               // B5

        stxsspx 41, 0, 21             // C1

        stxsspx 8, 0, 20              // C2

        stxsspx 10, 0, 19             // C3

        stxsspx 12, 0, 18             // C4

        xxswapd  10, 40

        lxvd2x 8, 0, 18               // C5

        stxsspx 43, 0, 17             // D1

        stxsspx 13, 0, 16             // D2

        stxsspx 4, 0, 15              // D3

        stxsspx 5, 0, 14              // D4

        xxswapd  12, 9

        lxvd2x 4, 0, 14               // D5 

        stxsspx 0, 0, 6               // E1

        stxsspx 6, 0, 3               // E2 

        stxsspx 2, 0, 11              // E3 

        stxsspx 3, 0, 28              // E4

        xxswapd  6, 8

        lxvd2x 2, 0, 28               // E5

        ...

A[1..4] arrange 4 fp values in memory, and A5 loads them into a vector register;

similarly, B[1..5], C[1..5], D[1..5], E[1..5] reorganize different values into

vector registers. The problem is that A4 is very close to A5, which triggers the

very slow store forwarding on power8. In the perf result, almost all of the time is

consumed by these loads.

I expect that shuffling these values directly in registers would be much faster.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>