<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - Poor extension of extracted integer from vectors"
href="https://bugs.llvm.org/show_bug.cgi?id=34209">34209</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Poor extension of extracted integer from vectors
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Windows NT
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Backend: X86
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>llvm-dev@redking.me.uk
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org, spatel+llvm@rotateright.com
</td>
</tr></table>
<p>
<div>
<pre>In the gather+swizzle+store pattern below we are extracting the &lt;4 x i32&gt;
indices as a &lt;2 x i64&gt; and then splitting each element into &lt;2 x i32&gt;.
This is fine and might reduce the pressure on the fpu-integer pipe, but we are
performing an ASHR+MOV to extract the upper i32 bits instead of just an LSHR.
Hopefully this is just missing a DAGCombine pattern, but I haven't
investigated.
<a href="https://godbolt.org/g/JAUnqy">https://godbolt.org/g/JAUnqy</a>
#include &lt;x86intrin.h&gt;
#include &lt;stdint.h&gt;
void gather_swizzle(const __m128i *indices, float *buffer)
{
__m128i idx = *indices++;
uint32_t idx0 = static_cast&lt;uint32_t&gt;(_mm_extract_epi32(idx, 0));
uint32_t idx1 = static_cast&lt;uint32_t&gt;(_mm_extract_epi32(idx, 1));
uint32_t idx2 = static_cast&lt;uint32_t&gt;(_mm_extract_epi32(idx, 2));
uint32_t idx3 = static_cast&lt;uint32_t&gt;(_mm_extract_epi32(idx, 3));
float sum0 = buffer[idx0];
float sum1 = buffer[idx1];
float sum2 = buffer[idx2];
float sum3 = buffer[idx3];
buffer[0] = sum0;
buffer[1] = sum1;
buffer[2] = sum2;
buffer[3] = sum3;
}
llvm -mcpu=btver2 -mtriple=x86_64-unknown
define void @gather_swizzle((&lt;2 x i64&gt;* nocapture readonly, float* nocapture)
local_unnamed_addr #0 {
%3 = bitcast &lt;2 x i64&gt;* %0 to &lt;4 x i32&gt;*
%4 = load &lt;4 x i32&gt;, &lt;4 x i32&gt;* %3, align 16
%5 = extractelement &lt;4 x i32&gt; %4, i32 0
%6 = extractelement &lt;4 x i32&gt; %4, i32 1
%7 = extractelement &lt;4 x i32&gt; %4, i32 2
%8 = extractelement &lt;4 x i32&gt; %4, i32 3
%9 = zext i32 %5 to i64
%10 = getelementptr inbounds float, float* %1, i64 %9
%11 = bitcast float* %10 to i32*
%12 = load i32, i32* %11, align 4
%13 = zext i32 %6 to i64
%14 = getelementptr inbounds float, float* %1, i64 %13
%15 = bitcast float* %14 to i32*
%16 = load i32, i32* %15, align 4
%17 = zext i32 %7 to i64
%18 = getelementptr inbounds float, float* %1, i64 %17
%19 = bitcast float* %18 to i32*
%20 = load i32, i32* %19, align 4
%21 = zext i32 %8 to i64
%22 = getelementptr inbounds float, float* %1, i64 %21
%23 = bitcast float* %22 to i32*
%24 = load i32, i32* %23, align 4
%25 = bitcast float* %1 to i32*
store i32 %12, i32* %25, align 4
%26 = getelementptr inbounds float, float* %1, i64 1
%27 = bitcast float* %26 to i32*
store i32 %16, i32* %27, align 4
%28 = getelementptr inbounds float, float* %1, i64 2
%29 = bitcast float* %28 to i32*
store i32 %20, i32* %29, align 4
%30 = getelementptr inbounds float, float* %1, i64 3
%31 = bitcast float* %30 to i32*
store i32 %24, i32* %31, align 4
ret void
}
gather_swizzle(long long __vector(2) const*, float*): # @gather_swizzle(long
long __vector(2) const*, float*)
vmovdqa (%rdi), %xmm0
vpextrq $1, %xmm0, %rax
vmovq %xmm0, %rdx
movl %eax, %ecx
sarq $32, %rax
movl %edx, %edi
sarq $32, %rdx
movl %edx, %edx
movl %eax, %eax
movl (%rsi,%rdi,4), %edi
movl (%rsi,%rcx,4), %ecx
movl (%rsi,%rdx,4), %edx
movl (%rsi,%rax,4), %eax
movl %edi, (%rsi)
movl %edx, 4(%rsi)
movl %ecx, 8(%rsi)
movl %eax, 12(%rsi)
retq
Could be:
gather_swizzle(long long __vector(2) const*, float*): # @gather_swizzle(long
long __vector(2) const*, float*)
vmovdqa (%rdi), %xmm0
vpextrq $1, %xmm0, %rax
vmovq %xmm0, %rdx
movl %eax, %ecx
shrq $32, %rax
movl %edx, %edi
shrq $32, %rdx
movl (%rsi,%rdi,4), %edi
movl (%rsi,%rcx,4), %ecx
movl (%rsi,%rdx,4), %edx
movl (%rsi,%rax,4), %eax
movl %edi, (%rsi)
movl %edx, 4(%rsi)
movl %ecx, 8(%rsi)
movl %eax, 12(%rsi)
retq</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>