[llvm-bugs] [Bug 33830] New: [AVX] clang does not respect streaming store intrinsics
via llvm-bugs
llvm-bugs at lists.llvm.org
Mon Jul 17 17:17:16 PDT 2017
https://bugs.llvm.org/show_bug.cgi?id=33830
Bug ID: 33830
Summary: [AVX] clang does not respect streaming store
intrinsics
Product: clang
Version: 4.0
Hardware: All
OS: All
Status: NEW
Severity: normal
Priority: P
Component: LLVM Codegen
Assignee: unassignedclangbugs at nondot.org
Reporter: manfred.liebmann at uni-graz.at
CC: llvm-bugs at lists.llvm.org
clang 4.0.1 does not translate the AVX streaming store intrinsic _mm256_stream_pd
to a non-temporal store (VMOVNTPD, or the equivalent VMOVNTPS) but to a regular
unaligned store (VMOVUPS in the listing below). This leads to severe performance
degradation.
This bug is not present in the official release of clang 3.8.0.
This seems to be related to the changes introduced with
__builtin_nontemporal_store.
Streaming stores with the SSE instruction set do not seem to be affected.
Sample code:
#include <immintrin.h>
// Reproducer: fills two 32-byte-aligned buffers of n doubles, the first with
// the AVX (256-bit) streaming-store intrinsic and the second with the SSE
// (128-bit) one. Always returns 0; x and y are not released before returning.
int main() {
int n = 1024;
// AVX: 256-bit streaming stores
// Buffer of n doubles; the second _mm_malloc argument requests 32-byte alignment.
double* x = (double*)_mm_malloc( sizeof(double)*n, 32 );
// Same constant in every lane of the 256-bit vector.
__m256d a = _mm256_set1_pd(2017.0717);
// One 4-double store per iteration (i advances by 4).
for (int i = 0; i < 1024; i+=4) {
_mm256_stream_pd(x+i, a);
}
// SSE: 128-bit streaming stores
double* y = (double*)_mm_malloc( sizeof(double)*n, 32 );
__m128d b = _mm_set1_pd(2017.0717);
// NOTE(review): i advances by 4 doubles but each __m128d store covers only 2,
// so every other pair of doubles in y is left unwritten. Kept as reported,
// since the assembly listings in this report were generated from this code.
for (int i = 0; i < 1024; i+=4) {
_mm_stream_pd(y+i, b);
}
return 0;
}
# clang -S -O3 -march=native -o bug.s bug.c
------------------------------
Assembler code for clang 4.0.1
==============================
## clang 4.0.1 output for the sample program's main (Mach-O, AT&T syntax).
## The first store loop writes %ymm0 with vmovups (a regular store); the
## second store loop writes %xmm0 with vmovntps (a non-temporal store).
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 12
## 8-byte constant: the double 2017.0717, broadcast into %ymm0 below.
.section __TEXT,__literal8,8byte_literals
.p2align 3
LCPI0_0:
.quad 4656585990599183486 ## double 2017.0717
## 16-byte constant: two copies of 2017.0717, loaded into %xmm0 below.
.section __TEXT,__literal16,16byte_literals
.p2align 4
LCPI0_1:
.quad 4656585990599183486 ## double 2017.0717
.quad 4656585990599183486 ## double 2017.0717
.section __TEXT,__text,regular,pure_instructions
.globl _main
.p2align 4, 0x90
_main: ## @main
.cfi_startproc
## BB#0:
## Frame-pointer prologue; 16 bytes of locals hold the pointer slot at -8(%rbp).
pushq %rbp
Lcfi0:
.cfi_def_cfa_offset 16
Lcfi1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi2:
.cfi_def_cfa_register %rbp
subq $16, %rsp
## First allocation: posix_memalign(&slot, 32, 8192); 8192 = 1024 * 8 bytes.
leaq -8(%rbp), %rdi
movl $32, %esi
movl $8192, %edx ## imm = 0x2000
callq _posix_memalign
## %rax = allocated pointer, or 0 if posix_memalign returned non-zero.
## %rcx is zeroed here and then reused as the loop index (counted in doubles).
xorl %ecx, %ecx
testl %eax, %eax
movq -8(%rbp), %rax
cmovneq %rcx, %rax
vbroadcastsd LCPI0_0(%rip), %ymm0
.p2align 4, 0x90
LBB0_1: ## =>This Inner Loop Header: Depth=1
## 256-bit loop, unrolled 16x: sixteen 32-byte stores = 64 doubles per pass.
## These are the stores the report is about: vmovups instead of a
## non-temporal vmovntps/vmovntpd.
vmovups %ymm0, (%rax,%rcx,8)
vmovups %ymm0, 32(%rax,%rcx,8)
vmovups %ymm0, 64(%rax,%rcx,8)
vmovups %ymm0, 96(%rax,%rcx,8)
vmovups %ymm0, 128(%rax,%rcx,8)
vmovups %ymm0, 160(%rax,%rcx,8)
vmovups %ymm0, 192(%rax,%rcx,8)
vmovups %ymm0, 224(%rax,%rcx,8)
vmovups %ymm0, 256(%rax,%rcx,8)
vmovups %ymm0, 288(%rax,%rcx,8)
vmovups %ymm0, 320(%rax,%rcx,8)
vmovups %ymm0, 352(%rax,%rcx,8)
vmovups %ymm0, 384(%rax,%rcx,8)
vmovups %ymm0, 416(%rax,%rcx,8)
vmovups %ymm0, 448(%rax,%rcx,8)
vmovups %ymm0, 480(%rax,%rcx,8)
## Advance by the 64 doubles just written; repeat until the index reaches 1024.
addq $64, %rcx
cmpq $1024, %rcx ## imm = 0x400
jl LBB0_1
## BB#2:
## Second allocation, same arguments; vzeroupper is issued before the call.
leaq -8(%rbp), %rdi
movl $32, %esi
movl $8192, %edx ## imm = 0x2000
vzeroupper
callq _posix_memalign
## Same pointer-or-0 selection and index reset as for the first buffer.
xorl %ecx, %ecx
testl %eax, %eax
movq -8(%rbp), %rax
cmovneq %rcx, %rax
vmovaps LCPI0_1(%rip), %xmm0 ## xmm0 = [2.017072e+03,2.017072e+03]
.p2align 4, 0x90
LBB0_3: ## =>This Inner Loop Header: Depth=1
## 128-bit loop: non-temporal 16-byte stores placed 32 bytes apart, so the
## 16 bytes after each store are not written (the C loop steps by 4 doubles).
vmovntps %xmm0, (%rax,%rcx,8)
vmovntps %xmm0, 32(%rax,%rcx,8)
vmovntps %xmm0, 64(%rax,%rcx,8)
vmovntps %xmm0, 96(%rax,%rcx,8)
vmovntps %xmm0, 128(%rax,%rcx,8)
vmovntps %xmm0, 160(%rax,%rcx,8)
vmovntps %xmm0, 192(%rax,%rcx,8)
vmovntps %xmm0, 224(%rax,%rcx,8)
vmovntps %xmm0, 256(%rax,%rcx,8)
vmovntps %xmm0, 288(%rax,%rcx,8)
vmovntps %xmm0, 320(%rax,%rcx,8)
vmovntps %xmm0, 352(%rax,%rcx,8)
vmovntps %xmm0, 384(%rax,%rcx,8)
vmovntps %xmm0, 416(%rax,%rcx,8)
vmovntps %xmm0, 448(%rax,%rcx,8)
vmovntps %xmm0, 480(%rax,%rcx,8)
addq $64, %rcx
cmpq $1024, %rcx ## imm = 0x400
jl LBB0_3
## BB#4:
## return 0: zero %eax, release the locals, restore %rbp.
xorl %eax, %eax
addq $16, %rsp
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
------------------------------
Assembler code for clang 3.8.0
==============================
## clang 3.8.0 output for the sample program's main (Mach-O, AT&T syntax).
## Both store loops use the non-temporal vmovntps: on %ymm0 in the first
## loop and on %xmm0 in the second.
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 12
## 8-byte constant: the double 2017.0717, broadcast into %ymm0 below.
.section __TEXT,__literal8,8byte_literals
.align 3
LCPI0_0:
.quad 4656585990599183486 ## double 2017.0717
## 16-byte constant: two copies of 2017.0717, loaded into %xmm0 below.
.section __TEXT,__literal16,16byte_literals
.align 4
LCPI0_1:
.quad 4656585990599183486 ## double 2017.0717
.quad 4656585990599183486 ## double 2017.0717
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main: ## @main
.cfi_startproc
## BB#0:
## Frame-pointer prologue; 16 bytes of locals hold the pointer slot at -8(%rbp).
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
subq $16, %rsp
## First allocation: posix_memalign(&slot, 32, 8192); 8192 = 1024 * 8 bytes.
leaq -8(%rbp), %rdi
movl $32, %esi
movl $8192, %edx ## imm = 0x2000
callq _posix_memalign
## %rax = allocated pointer, or 0 if posix_memalign returned non-zero.
## %rcx is zeroed here and then reused as the loop index (counted in doubles).
xorl %ecx, %ecx
testl %eax, %eax
movq -8(%rbp), %rax
cmovneq %rcx, %rax
vbroadcastsd LCPI0_0(%rip), %ymm0
.align 4, 0x90
LBB0_1: ## =>This Inner Loop Header: Depth=1
## 256-bit loop, unrolled 16x: sixteen 32-byte non-temporal stores = 64
## doubles per pass. This is the behaviour the report expects.
vmovntps %ymm0, (%rax,%rcx,8)
vmovntps %ymm0, 32(%rax,%rcx,8)
vmovntps %ymm0, 64(%rax,%rcx,8)
vmovntps %ymm0, 96(%rax,%rcx,8)
vmovntps %ymm0, 128(%rax,%rcx,8)
vmovntps %ymm0, 160(%rax,%rcx,8)
vmovntps %ymm0, 192(%rax,%rcx,8)
vmovntps %ymm0, 224(%rax,%rcx,8)
vmovntps %ymm0, 256(%rax,%rcx,8)
vmovntps %ymm0, 288(%rax,%rcx,8)
vmovntps %ymm0, 320(%rax,%rcx,8)
vmovntps %ymm0, 352(%rax,%rcx,8)
vmovntps %ymm0, 384(%rax,%rcx,8)
vmovntps %ymm0, 416(%rax,%rcx,8)
vmovntps %ymm0, 448(%rax,%rcx,8)
vmovntps %ymm0, 480(%rax,%rcx,8)
## Advance by the 64 doubles just written; repeat until the index reaches 1024.
addq $64, %rcx
cmpq $1024, %rcx ## imm = 0x400
jl LBB0_1
## BB#2:
## Second allocation, same arguments; vzeroupper is issued before the call.
leaq -8(%rbp), %rdi
movl $32, %esi
movl $8192, %edx ## imm = 0x2000
vzeroupper
callq _posix_memalign
## Same pointer-or-0 selection and index reset as for the first buffer.
xorl %ecx, %ecx
testl %eax, %eax
movq -8(%rbp), %rax
cmovneq %rcx, %rax
vmovaps LCPI0_1(%rip), %xmm0 ## xmm0 = [2.017072e+03,2.017072e+03]
.align 4, 0x90
LBB0_3: ## =>This Inner Loop Header: Depth=1
## 128-bit loop: non-temporal 16-byte stores placed 32 bytes apart, so the
## 16 bytes after each store are not written (the C loop steps by 4 doubles).
vmovntps %xmm0, (%rax,%rcx,8)
vmovntps %xmm0, 32(%rax,%rcx,8)
vmovntps %xmm0, 64(%rax,%rcx,8)
vmovntps %xmm0, 96(%rax,%rcx,8)
vmovntps %xmm0, 128(%rax,%rcx,8)
vmovntps %xmm0, 160(%rax,%rcx,8)
vmovntps %xmm0, 192(%rax,%rcx,8)
vmovntps %xmm0, 224(%rax,%rcx,8)
vmovntps %xmm0, 256(%rax,%rcx,8)
vmovntps %xmm0, 288(%rax,%rcx,8)
vmovntps %xmm0, 320(%rax,%rcx,8)
vmovntps %xmm0, 352(%rax,%rcx,8)
vmovntps %xmm0, 384(%rax,%rcx,8)
vmovntps %xmm0, 416(%rax,%rcx,8)
vmovntps %xmm0, 448(%rax,%rcx,8)
vmovntps %xmm0, 480(%rax,%rcx,8)
addq $64, %rcx
cmpq $1024, %rcx ## imm = 0x400
jl LBB0_3
## BB#4:
## return 0: zero %eax, release the locals, restore %rbp.
xorl %eax, %eax
addq $16, %rsp
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170718/f459dced/attachment-0001.html>
More information about the llvm-bugs
mailing list