[llvm-bugs] [Bug 34209] New: Poor extension of extracted integer from vectors

via llvm-bugs llvm-bugs at lists.llvm.org
Wed Aug 16 04:11:54 PDT 2017


https://bugs.llvm.org/show_bug.cgi?id=34209

            Bug ID: 34209
           Summary: Poor extension of extracted integer from vectors
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: llvm-dev at redking.me.uk
                CC: llvm-bugs at lists.llvm.org, spatel+llvm at rotateright.com

In the gather+swizzle+store pattern below we are extracting the <4 x i32>
indices as a <2 x i64> and then splitting each element into <2 x i32>.

This is fine and might reduce the pressure on the fpu-integer pipe, but we are
performing an ASHR+MOV to extract the upper i32 bits instead of just an LSHR.

Hopefully this is just missing a DAGCombine pattern, but I haven't
investigated.

https://godbolt.org/g/JAUnqy

#include <x86intrin.h>
#include <stdint.h>

void gather_swizzle(const __m128i *indices, float *buffer)
{
  __m128i idx = *indices++;
  uint32_t idx0 = static_cast<uint32_t>(_mm_extract_epi32(idx, 0));
  uint32_t idx1 = static_cast<uint32_t>(_mm_extract_epi32(idx, 1));
  uint32_t idx2 = static_cast<uint32_t>(_mm_extract_epi32(idx, 2));
  uint32_t idx3 = static_cast<uint32_t>(_mm_extract_epi32(idx, 3));

  float sum0 = buffer[idx0];
  float sum1 = buffer[idx1];
  float sum2 = buffer[idx2];
  float sum3 = buffer[idx3];

  buffer[0] = sum0;
  buffer[1] = sum1;
  buffer[2] = sum2;
  buffer[3] = sum3;
}

llvm -mcpu=btver2 -mtriple=x86_64-unknown

define void @gather_swizzle(<2 x i64>* nocapture readonly, float* nocapture)
local_unnamed_addr #0 {
  %3 = bitcast <2 x i64>* %0 to <4 x i32>*
  %4 = load <4 x i32>, <4 x i32>* %3, align 16
  %5 = extractelement <4 x i32> %4, i32 0
  %6 = extractelement <4 x i32> %4, i32 1
  %7 = extractelement <4 x i32> %4, i32 2
  %8 = extractelement <4 x i32> %4, i32 3
  %9 = zext i32 %5 to i64
  %10 = getelementptr inbounds float, float* %1, i64 %9
  %11 = bitcast float* %10 to i32*
  %12 = load i32, i32* %11, align 4
  %13 = zext i32 %6 to i64
  %14 = getelementptr inbounds float, float* %1, i64 %13
  %15 = bitcast float* %14 to i32*
  %16 = load i32, i32* %15, align 4
  %17 = zext i32 %7 to i64
  %18 = getelementptr inbounds float, float* %1, i64 %17
  %19 = bitcast float* %18 to i32*
  %20 = load i32, i32* %19, align 4
  %21 = zext i32 %8 to i64
  %22 = getelementptr inbounds float, float* %1, i64 %21
  %23 = bitcast float* %22 to i32*
  %24 = load i32, i32* %23, align 4
  %25 = bitcast float* %1 to i32*
  store i32 %12, i32* %25, align 4
  %26 = getelementptr inbounds float, float* %1, i64 1
  %27 = bitcast float* %26 to i32*
  store i32 %16, i32* %27, align 4
  %28 = getelementptr inbounds float, float* %1, i64 2
  %29 = bitcast float* %28 to i32*
  store i32 %20, i32* %29, align 4
  %30 = getelementptr inbounds float, float* %1, i64 3
  %31 = bitcast float* %30 to i32*
  store i32 %24, i32* %31, align 4
  ret void
}

gather_swizzle(long long __vector(2) const*, float*): # @gather_swizzle(long
long __vector(2) const*, float*)
  vmovdqa (%rdi), %xmm0
  vpextrq $1, %xmm0, %rax
  vmovq %xmm0, %rdx
  movl %eax, %ecx
  sarq $32, %rax
  movl %edx, %edi
  sarq $32, %rdx
  movl %edx, %edx
  movl %eax, %eax
  movl (%rsi,%rdi,4), %edi
  movl (%rsi,%rcx,4), %ecx
  movl (%rsi,%rdx,4), %edx
  movl (%rsi,%rax,4), %eax
  movl %edi, (%rsi)
  movl %edx, 4(%rsi)
  movl %ecx, 8(%rsi)
  movl %eax, 12(%rsi)
  retq

Could be:

gather_swizzle(long long __vector(2) const*, float*): # @gather_swizzle(long
long __vector(2) const*, float*)
  vmovdqa (%rdi), %xmm0
  vpextrq $1, %xmm0, %rax
  vmovq %xmm0, %rdx
  movl %eax, %ecx
  shrq $32, %rax
  movl %edx, %edi
  shrq $32, %rdx
  movl (%rsi,%rdi,4), %edi
  movl (%rsi,%rcx,4), %ecx
  movl (%rsi,%rdx,4), %edx
  movl (%rsi,%rax,4), %eax
  movl %edi, (%rsi)
  movl %edx, 4(%rsi)
  movl %ecx, 8(%rsi)
  movl %eax, 12(%rsi)
  retq

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170816/58d0fb58/attachment.html>


More information about the llvm-bugs mailing list