<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Vectorized code degrades performance 3x swicthing from SSE4.1 to SSE4.2"

   href="https://bugs.llvm.org/show_bug.cgi?id=42152">42152</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Vectorized code degrades performance 3x swicthing from SSE4.1 to SSE4.2

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Windows NT

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Loop Optimizer

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>spreis@yandex-team.ru

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>For the code below (<a href="https://gcc.godbolt.org/z/Z3JgG6">https://gcc.godbolt.org/z/Z3JgG6</a>) compilation with SSE4.2

produces 3x slower code that with just SSE4.1.

The code for SSE4.1 is pretty straightforwardly vectorized i32x2 and unrolled

by 2 producing clean and understandable code.

For SSE4.2 the code is again vectorized i32x2 and unrolled by 2, but some

optimization fuses series of nice load <2 x i32> scattered among the loop

(interleaved with compute) code into huge block of

  %64 = bitcast i32* %60 to <8 x i32>*

  %65 = bitcast i32* %63 to <8 x i32>*

  %66 = load <8 x i32>, <8 x i32>* %64, align 4, !dbg !46, !tbaa !48

  %67 = load <8 x i32>, <8 x i32>* %65, align 4, !dbg !46, !tbaa !48

  %68 = shufflevector <8 x i32> %66, <8 x i32> undef, <2 x i32> <i32 0, i32 4>,

!dbg !46

  %69 = shufflevector <8 x i32> %67, <8 x i32> undef, <2 x i32> <i32 0, i32 4>,

!dbg !46

  %70 = shufflevector <8 x i32> %66, <8 x i32> undef, <2 x i32> <i32 1, i32 5>,

!dbg !46

  %71 = shufflevector <8 x i32> %67, <8 x i32> undef, <2 x i32> <i32 1, i32 5>,

!dbg !46

  %72 = shufflevector <8 x i32> %66, <8 x i32> undef, <2 x i32> <i32 2, i32 6>,

!dbg !46

  %73 = shufflevector <8 x i32> %67, <8 x i32> undef, <2 x i32> <i32 2, i32 6>,

!dbg !46

  %74 = shufflevector <8 x i32> %66, <8 x i32> undef, <2 x i32> <i32 3, i32 7>,

!dbg !46

  %75 = shufflevector <8 x i32> %67, <8 x i32> undef, <2 x i32> <i32 3, i32 7>,

!dbg !46

  %76 = bitcast i32* %55 to <8 x i32>*

  %77 = bitcast i32* %58 to <8 x i32>*

  %78 = load <8 x i32>, <8 x i32>* %76, align 4, !dbg !52, !tbaa !48

  %79 = load <8 x i32>, <8 x i32>* %77, align 4, !dbg !52, !tbaa !48

  %80 = shufflevector <8 x i32> %78, <8 x i32> undef, <2 x i32> <i32 0, i32 4>,

!dbg !52

  %81 = shufflevector <8 x i32> %79, <8 x i32> undef, <2 x i32> <i32 0, i32 4>,

!dbg !52

  %82 = shufflevector <8 x i32> %78, <8 x i32> undef, <2 x i32> <i32 1, i32 5>,

!dbg !52

  %83 = shufflevector <8 x i32> %79, <8 x i32> undef, <2 x i32> <i32 1, i32 5>,

!dbg !52

  %84 = shufflevector <8 x i32> %78, <8 x i32> undef, <2 x i32> <i32 2, i32 6>,

!dbg !52

  %85 = shufflevector <8 x i32> %79, <8 x i32> undef, <2 x i32> <i32 2, i32 6>,

!dbg !52

  %86 = shufflevector <8 x i32> %78, <8 x i32> undef, <2 x i32> <i32 3, i32 7>,

!dbg !52

  %87 = shufflevector <8 x i32> %79, <8 x i32> undef, <2 x i32> <i32 3, i32 7>,

!dbg !52

Each shuffle is than lowered into 4 incluctions:

        psllq   $32, %xmm6

        pshufd  $245, %xmm6, %xmm0      # xmm0 = xmm6[1,1,3,3]

        psrad   $31, %xmm6

        pblendw $51, %xmm0, %xmm6       # xmm6 =

xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]

Those double number of instructions in a loop and significantly increase

register pressure. It seems that something wrong with the cost model for this

optimization. I hardly believe that such transformation can be ever profitable

with SSE4 if all: it provides 2x improvement on loads, but shuffles seem to be

too costly. 

---

The code:

    template <typename T, typename R = T>

    R AbsDiff(T a, T b) {

        if (a < b)

            return (R)b - (R)a;

        return (R)a - (R)b;

    }

    template <typename Number, typename Result = unsigned long long>

    Result L1DistanceImpl(const Number* lhs, const Number* rhs, int length) {

        Result s0 = 0;

        Result s1 = 0;

        Result s2 = 0;

        Result s3 = 0;

        while (length >= 4) {

            s0 += AbsDiff(lhs[0], rhs[0]);

            s1 += AbsDiff(lhs[1], rhs[1]);

            s2 += AbsDiff(lhs[2], rhs[2]);

            s3 += AbsDiff(lhs[3], rhs[3]);

            length -= 4;

            lhs += 4;

            rhs += 4;

        }

        while (length) {

            s0 += AbsDiff(*lhs++, *rhs++);

            --length;

        }

        return s0 + s1 + s2 + s3;

    }

unsigned long long L1Distance(const int* lhs, const int* rhs, int length) {

    return L1DistanceImpl<int>(lhs, rhs, length);

}</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>