<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Poor extension of extracted integer from vectors"

   href="https://bugs.llvm.org/show_bug.cgi?id=34209">34209</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Poor extension of extracted integer from vectors

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Windows NT

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>llvm-dev@redking.me.uk

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org, spatel+llvm@rotateright.com

          </td>

        </tr></table>

      <p>

        <div>

        <pre>In the gather+swizzle+store pattern below we are extracting the &lt;4 x i32&gt;

indices as a &lt;2 x i64&gt; and then splitting each element into &lt;2 x i32&gt;.

This is fine and might reduce the pressure on the fpu-integer pipe, but we are

performing an ASHR+MOV to extract the upper i32 bits instead of just an LSHR.

Hopefully this is just missing a DAGCombine pattern, but I haven't

investigated.

<a href="https://godbolt.org/g/JAUnqy">https://godbolt.org/g/JAUnqy</a>

#include &lt;x86intrin.h&gt;

#include &lt;stdint.h&gt;

void gather_swizzle(const __m128i *indices, float *buffer)

{

  __m128i idx = *indices++;

  uint32_t idx0 = static_cast&lt;uint32_t&gt;(_mm_extract_epi32(idx, 0));

  uint32_t idx1 = static_cast&lt;uint32_t&gt;(_mm_extract_epi32(idx, 1));

  uint32_t idx2 = static_cast&lt;uint32_t&gt;(_mm_extract_epi32(idx, 2));

  uint32_t idx3 = static_cast&lt;uint32_t&gt;(_mm_extract_epi32(idx, 3));

  float sum0 = buffer[idx0];

  float sum1 = buffer[idx1];

  float sum2 = buffer[idx2];

  float sum3 = buffer[idx3];

  buffer[0] = sum0;

  buffer[1] = sum1;

  buffer[2] = sum2;

  buffer[3] = sum3;

}

llvm -mcpu=btver2 -mtriple=x86_64-unknown

define void @gather_swizzle(&lt;2 x i64&gt;* nocapture readonly, float* nocapture)

local_unnamed_addr #0 {

  %3 = bitcast &lt;2 x i64&gt;* %0 to &lt;4 x i32&gt;*

  %4 = load &lt;4 x i32&gt;, &lt;4 x i32&gt;* %3, align 16

  %5 = extractelement &lt;4 x i32&gt; %4, i32 0

  %6 = extractelement &lt;4 x i32&gt; %4, i32 1

  %7 = extractelement &lt;4 x i32&gt; %4, i32 2

  %8 = extractelement &lt;4 x i32&gt; %4, i32 3

  %9 = zext i32 %5 to i64

  %10 = getelementptr inbounds float, float* %1, i64 %9

  %11 = bitcast float* %10 to i32*

  %12 = load i32, i32* %11, align 4

  %13 = zext i32 %6 to i64

  %14 = getelementptr inbounds float, float* %1, i64 %13

  %15 = bitcast float* %14 to i32*

  %16 = load i32, i32* %15, align 4

  %17 = zext i32 %7 to i64

  %18 = getelementptr inbounds float, float* %1, i64 %17

  %19 = bitcast float* %18 to i32*

  %20 = load i32, i32* %19, align 4

  %21 = zext i32 %8 to i64

  %22 = getelementptr inbounds float, float* %1, i64 %21

  %23 = bitcast float* %22 to i32*

  %24 = load i32, i32* %23, align 4

  %25 = bitcast float* %1 to i32*

  store i32 %12, i32* %25, align 4

  %26 = getelementptr inbounds float, float* %1, i64 1

  %27 = bitcast float* %26 to i32*

  store i32 %16, i32* %27, align 4

  %28 = getelementptr inbounds float, float* %1, i64 2

  %29 = bitcast float* %28 to i32*

  store i32 %20, i32* %29, align 4

  %30 = getelementptr inbounds float, float* %1, i64 3

  %31 = bitcast float* %30 to i32*

  store i32 %24, i32* %31, align 4

  ret void

}

gather_swizzle(long long __vector(2) const*, float*): # @gather_swizzle(long

long __vector(2) const*, float*)

  vmovdqa (%rdi), %xmm0

  vpextrq $1, %xmm0, %rax

  vmovq %xmm0, %rdx

  movl %eax, %ecx

  sarq $32, %rax

  movl %edx, %edi

  sarq $32, %rdx

  movl %edx, %edx

  movl %eax, %eax

  movl (%rsi,%rdi,4), %edi

  movl (%rsi,%rcx,4), %ecx

  movl (%rsi,%rdx,4), %edx

  movl (%rsi,%rax,4), %eax

  movl %edi, (%rsi)

  movl %edx, 4(%rsi)

  movl %ecx, 8(%rsi)

  movl %eax, 12(%rsi)

  retq

Could be:

gather_swizzle(long long __vector(2) const*, float*): # @gather_swizzle(long

long __vector(2) const*, float*)

  vmovdqa (%rdi), %xmm0

  vpextrq $1, %xmm0, %rax

  vmovq %xmm0, %rdx

  movl %eax, %ecx

  shrq $32, %rax

  movl %edx, %edi

  shrq $32, %rdx

  movl (%rsi,%rdi,4), %edi

  movl (%rsi,%rcx,4), %ecx

  movl (%rsi,%rdx,4), %edx

  movl (%rsi,%rax,4), %eax

  movl %edi, (%rsi)

  movl %edx, 4(%rsi)

  movl %ecx, 8(%rsi)

  movl %eax, 12(%rsi)

  retq</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>