<html>
    <head>
      <base href="https://llvm.org/bugs/" />
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW --- - [ppc] slow data reorganization in VSX register (through memory)"
   href="https://llvm.org/bugs/show_bug.cgi?id=27078">27078</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>[ppc] slow data reorganization in VSX register (through memory)
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Backend: PowerPC
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>carrot@google.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr>

        <tr>
          <th>Classification</th>
          <td>Unclassified
          </td>
        </tr></table>
      <p>
        <div>
        <pre>Compile the following code with options:
 -mvsx -mcpu=power8 -g0 -O2

typedef float Vector3_f[3];

void foo(Vector3_f* blurred_row, int width, float* pixel, float pixel_diff_avg)
{
    for (int j = 0; j &lt; width; ++j, pixel += 3) {
      float* blurred_pixel = blurred_row[j];

      float pixel_diff[3];
      pixel_diff[0] = blurred_pixel[0] - pixel[0];
      pixel_diff[1] = blurred_pixel[1] - pixel[1];
      pixel_diff[2] = blurred_pixel[2] - pixel[2];

      pixel_diff[0] -= pixel_diff_avg;
      pixel_diff[1] -= pixel_diff_avg;
      pixel_diff[2] -= pixel_diff_avg;

      pixel[0] += pixel_diff[0];
      pixel[1] += pixel_diff[1];
      pixel[2] += pixel_diff[2];
    }   
}

LLVM tries to vectorize the loop body and generates the following code:

        ...
        stxsspx 36, 0, 25            // A1
        xscvspdpn 8, 8
        addi 25, 31, 308 
        xxsldwi 39, 2, 2, 3
        xscvspdpn 10, 10
        ld 28, 88(31)                   # 8-byte Folded Reload
        xxsldwi 38, 0, 0, 1
        xscvspdpn 12, 12
        stxsspx 7, 0, 28             // A2
        xxsldwi 2, 2, 2, 2
        xscvspdpn 4, 39
        ld 28, 80(31)                   # 8-byte Folded Reload
        xxsldwi 3, 3, 3, 1
        xscvspdpn 13, 38
        xxsldwi 42, 0, 0, 3
        stxsspx 9, 0, 28              // A3
        stxsspx 11, 0, 26             // A4
        xscvspdpn 0, 0
        lxvd2x 7, 0, 26               // A5
        ld 24, 72(31)                   # 8-byte Folded Reload
        xscvspdpn 2, 2
        xscvspdpn 6, 42
        stxsspx 32, 0, 24             // B1
        xscvspdpn 3, 3
        addi 24, 31, 304 
        ld 28, 64(31)                   # 8-byte Folded Reload
        xxswapd  11, 7
        stxsspx 35, 0, 28             // B2
        stxsspx 37, 0, 23             // B3
        addi 28, 31, 224 
        stxsspx 33, 0, 22             // B4
        ori 2, 2, 0
        lxvd2x 9, 0, 22               // B5
        stxsspx 41, 0, 21             // C1
        stxsspx 8, 0, 20              // C2
        stxsspx 10, 0, 19             // C3
        stxsspx 12, 0, 18             // C4
        xxswapd  10, 40
        lxvd2x 8, 0, 18               // C5
        stxsspx 43, 0, 17             // D1
        stxsspx 13, 0, 16             // D2
        stxsspx 4, 0, 15              // D3
        stxsspx 5, 0, 14              // D4
        xxswapd  12, 9
        lxvd2x 4, 0, 14               // D5 
        stxsspx 0, 0, 6               // E1
        stxsspx 6, 0, 3               // E2 
        stxsspx 2, 0, 11              // E3 
        stxsspx 3, 0, 28              // E4
        xxswapd  6, 8
        lxvd2x 2, 0, 28               // E5
        ...

A[1..4] arrange 4 fp values in memory, and A5 loads them into a vector register;
similarly, B[1..5], C[1..5], D[1..5], and E[1..5] reorganize different values into
vector registers. The problem is that A4 is very close to A5, which triggers the
very slow store forwarding on power8. In the perf results, almost all of the time is
consumed by these loads.

I expect that shuffling these values directly in registers would be much faster.</pre>
        </div>
      </p>
      <hr>
      <span>You are receiving this mail because:</span>
      
      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>