[llvm-bugs] [Bug 33830] New: [AVX] clang does not respect streaming store intrinsics

Mon Jul 17 17:17:16 PDT 2017

https://bugs.llvm.org/show_bug.cgi?id=33830

            Bug ID: 33830
           Summary: [AVX] clang does not respect streaming store
                    intrinsics
           Product: clang
           Version: 4.0
          Hardware: All
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: LLVM Codegen
          Assignee: unassignedclangbugs at nondot.org
          Reporter: manfred.liebmann at uni-graz.at
                CC: llvm-bugs at lists.llvm.org

The AVX streaming store intrinsic _mm256_stream_pd is not translated by clang
4.0.1 to a non-temporal store (VMOVNTPD/VMOVNTPS) but to a regular unaligned
store (VMOVUPS in the listing below). This leads to severe performance
degradation.

This bug is not present in the official release of clang 3.8.0.
This seems to be related to the changes introduced with
__builtin_nontemporal_store.

Streaming stores with the SSE instruction set do not seem to be affected.

Sample code:
#include <immintrin.h>

/* Reproducer: fills two buffers with the AVX (256-bit) and SSE (128-bit)
 * streaming-store intrinsics so that the stores the compiler emits for
 * each loop can be compared in the assembly listings.
 * The source does not check the _mm_malloc results and never frees the
 * buffers. */
int main() {
        int n = 1024;
        //AVX: a __m256d holds 4 doubles, so the index advances by 4.
        // Note: the loop bound is the literal 1024, not n.
        double* x = (double*)_mm_malloc( sizeof(double)*n, 32 );
    __m256d a = _mm256_set1_pd(2017.0717);
        for (int i = 0; i < 1024; i+=4) {
                // Streaming store under test: this is the one that regresses.
                _mm256_stream_pd(x+i, a);
        }
        //SSE: the index also advances by 4 although a __m128d holds only 2
        // doubles (the listings show 16-byte stores at a 32-byte stride), so
        // every other pair of doubles in y is left unwritten. Presumably
        // unintentional; kept as-is because the listings come from this source.
        double* y = (double*)_mm_malloc( sizeof(double)*n, 32 );
        __m128d b = _mm_set1_pd(2017.0717);
        for (int i = 0; i < 1024; i+=4) {
                _mm_stream_pd(y+i, b);
        }
        return 0;
}

# clang -S -O3 -march=native -o bug.s bug.c

------------------------------
Assembler code for clang 4.0.1
==============================
        ## clang 4.0.1 output for the sample main() (x86-64, AT&T syntax, Mach-O).
        ## Point of interest: the 256-bit loop LBB0_1 stores with plain vmovups,
        ## while the 128-bit loop LBB0_3 stores with the non-temporal vmovntps.
        .section        __TEXT,__text,regular,pure_instructions
        .macosx_version_min 10, 12
        .section        __TEXT,__literal8,8byte_literals
        .p2align        3
        ## 8-byte constant that vbroadcastsd replicates into ymm0 below.
LCPI0_0:
        .quad   4656585990599183486     ## double 2017.0717
        .section        __TEXT,__literal16,16byte_literals
        .p2align        4
        ## 16-byte constant (two copies of the value) loaded into xmm0 below.
LCPI0_1:
        .quad   4656585990599183486     ## double 2017.0717
        .quad   4656585990599183486     ## double 2017.0717
        .section        __TEXT,__text,regular,pure_instructions
        .globl  _main
        .p2align        4, 0x90
_main:                                  ## @main
        .cfi_startproc
## BB#0:
        pushq   %rbp
Lcfi0:
        .cfi_def_cfa_offset 16
Lcfi1:
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
Lcfi2:
        .cfi_def_cfa_register %rbp
        subq    $16, %rsp
        ## First allocation: posix_memalign(&slot, 32, 8192), where slot is
        ## the stack location -8(%rbp) and 8192 = 1024 doubles * 8 bytes.
        leaq    -8(%rbp), %rdi
        movl    $32, %esi
        movl    $8192, %edx             ## imm = 0x2000
        callq   _posix_memalign
        ## rcx = 0 serves both as the NULL substituted on failure and as the
        ## initial loop index. The flags from testl survive the movq, so
        ## cmovneq replaces the loaded pointer with 0 when the call returned
        ## a nonzero status.
        xorl    %ecx, %ecx
        testl   %eax, %eax
        movq    -8(%rbp), %rax
        cmovneq %rcx, %rax
        vbroadcastsd    LCPI0_0(%rip), %ymm0
        .p2align        4, 0x90
        ## AVX loop, unrolled 16x: rax = buffer, rcx = index in doubles.
        ## Each pass writes 16 consecutive 32-byte vectors (64 doubles).
        ## These are ordinary unaligned stores (vmovups), not non-temporal
        ## ones -- this is the behaviour the report is about.
LBB0_1:                                 ## =>This Inner Loop Header: Depth=1
        vmovups %ymm0, (%rax,%rcx,8)
        vmovups %ymm0, 32(%rax,%rcx,8)
        vmovups %ymm0, 64(%rax,%rcx,8)
        vmovups %ymm0, 96(%rax,%rcx,8)
        vmovups %ymm0, 128(%rax,%rcx,8)
        vmovups %ymm0, 160(%rax,%rcx,8)
        vmovups %ymm0, 192(%rax,%rcx,8)
        vmovups %ymm0, 224(%rax,%rcx,8)
        vmovups %ymm0, 256(%rax,%rcx,8)
        vmovups %ymm0, 288(%rax,%rcx,8)
        vmovups %ymm0, 320(%rax,%rcx,8)
        vmovups %ymm0, 352(%rax,%rcx,8)
        vmovups %ymm0, 384(%rax,%rcx,8)
        vmovups %ymm0, 416(%rax,%rcx,8)
        vmovups %ymm0, 448(%rax,%rcx,8)
        vmovups %ymm0, 480(%rax,%rcx,8)
        addq    $64, %rcx
        cmpq    $1024, %rcx             ## imm = 0x400
        jl      LBB0_1
## BB#2:
        ## Second allocation with the same arguments; it reuses the same
        ## stack slot, so the first buffer's address is not kept anywhere.
        ## vzeroupper is issued before the call, after the ymm code above.
        leaq    -8(%rbp), %rdi
        movl    $32, %esi
        movl    $8192, %edx             ## imm = 0x2000
        vzeroupper
        callq   _posix_memalign
        ## Same "NULL on failure" selection and index reset as above.
        xorl    %ecx, %ecx
        testl   %eax, %eax
        movq    -8(%rbp), %rax
        cmovneq %rcx, %rax
        vmovaps LCPI0_1(%rip), %xmm0    ## xmm0 = [2.017072e+03,2.017072e+03]
        .p2align        4, 0x90
        ## SSE-width loop, unrolled 16x, using non-temporal stores (vmovntps).
        ## Each store writes 16 bytes, but consecutive stores are 32 bytes
        ## apart, matching the i+=4 step in the C source.
LBB0_3:                                 ## =>This Inner Loop Header: Depth=1
        vmovntps        %xmm0, (%rax,%rcx,8)
        vmovntps        %xmm0, 32(%rax,%rcx,8)
        vmovntps        %xmm0, 64(%rax,%rcx,8)
        vmovntps        %xmm0, 96(%rax,%rcx,8)
        vmovntps        %xmm0, 128(%rax,%rcx,8)
        vmovntps        %xmm0, 160(%rax,%rcx,8)
        vmovntps        %xmm0, 192(%rax,%rcx,8)
        vmovntps        %xmm0, 224(%rax,%rcx,8)
        vmovntps        %xmm0, 256(%rax,%rcx,8)
        vmovntps        %xmm0, 288(%rax,%rcx,8)
        vmovntps        %xmm0, 320(%rax,%rcx,8)
        vmovntps        %xmm0, 352(%rax,%rcx,8)
        vmovntps        %xmm0, 384(%rax,%rcx,8)
        vmovntps        %xmm0, 416(%rax,%rcx,8)
        vmovntps        %xmm0, 448(%rax,%rcx,8)
        vmovntps        %xmm0, 480(%rax,%rcx,8)
        addq    $64, %rcx
        cmpq    $1024, %rcx             ## imm = 0x400
        jl      LBB0_3
## BB#4:
        ## return 0, then undo the 16-byte frame and restore rbp.
        xorl    %eax, %eax
        addq    $16, %rsp
        popq    %rbp
        retq
        .cfi_endproc

.subsections_via_symbols

------------------------------
Assembler code for clang 3.8.0
==============================

        ## clang 3.8.0 output for the sample main() (x86-64, AT&T syntax, Mach-O).
        ## Reference behaviour: both the 256-bit loop LBB0_1 and the 128-bit
        ## loop LBB0_3 store with the non-temporal vmovntps.
        .section        __TEXT,__text,regular,pure_instructions
        .macosx_version_min 10, 12
        .section        __TEXT,__literal8,8byte_literals
        .align  3
        ## 8-byte constant that vbroadcastsd replicates into ymm0 below.
LCPI0_0:
        .quad   4656585990599183486     ## double 2017.0717
        .section        __TEXT,__literal16,16byte_literals
        .align  4
        ## 16-byte constant (two copies of the value) loaded into xmm0 below.
LCPI0_1:
        .quad   4656585990599183486     ## double 2017.0717
        .quad   4656585990599183486     ## double 2017.0717
        .section        __TEXT,__text,regular,pure_instructions
        .globl  _main
        .align  4, 0x90
_main:                                  ## @main
        .cfi_startproc
## BB#0:
        pushq   %rbp
Ltmp0:
        .cfi_def_cfa_offset 16
Ltmp1:
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
Ltmp2:
        .cfi_def_cfa_register %rbp
        subq    $16, %rsp
        ## First allocation: posix_memalign(&slot, 32, 8192), where slot is
        ## the stack location -8(%rbp) and 8192 = 1024 doubles * 8 bytes.
        leaq    -8(%rbp), %rdi
        movl    $32, %esi
        movl    $8192, %edx             ## imm = 0x2000
        callq   _posix_memalign
        ## rcx = 0 serves both as the NULL substituted on failure and as the
        ## initial loop index. The flags from testl survive the movq, so
        ## cmovneq replaces the loaded pointer with 0 when the call returned
        ## a nonzero status.
        xorl    %ecx, %ecx
        testl   %eax, %eax
        movq    -8(%rbp), %rax
        cmovneq %rcx, %rax
        vbroadcastsd    LCPI0_0(%rip), %ymm0
        .align  4, 0x90
        ## AVX loop, unrolled 16x: rax = buffer, rcx = index in doubles.
        ## Each pass writes 16 consecutive 32-byte vectors (64 doubles) with
        ## non-temporal stores (vmovntps on ymm0).
LBB0_1:                                 ## =>This Inner Loop Header: Depth=1
        vmovntps        %ymm0, (%rax,%rcx,8)
        vmovntps        %ymm0, 32(%rax,%rcx,8)
        vmovntps        %ymm0, 64(%rax,%rcx,8)
        vmovntps        %ymm0, 96(%rax,%rcx,8)
        vmovntps        %ymm0, 128(%rax,%rcx,8)
        vmovntps        %ymm0, 160(%rax,%rcx,8)
        vmovntps        %ymm0, 192(%rax,%rcx,8)
        vmovntps        %ymm0, 224(%rax,%rcx,8)
        vmovntps        %ymm0, 256(%rax,%rcx,8)
        vmovntps        %ymm0, 288(%rax,%rcx,8)
        vmovntps        %ymm0, 320(%rax,%rcx,8)
        vmovntps        %ymm0, 352(%rax,%rcx,8)
        vmovntps        %ymm0, 384(%rax,%rcx,8)
        vmovntps        %ymm0, 416(%rax,%rcx,8)
        vmovntps        %ymm0, 448(%rax,%rcx,8)
        vmovntps        %ymm0, 480(%rax,%rcx,8)
        addq    $64, %rcx
        cmpq    $1024, %rcx             ## imm = 0x400
        jl      LBB0_1
## BB#2:
        ## Second allocation with the same arguments; it reuses the same
        ## stack slot, so the first buffer's address is not kept anywhere.
        ## vzeroupper is issued before the call, after the ymm code above.
        leaq    -8(%rbp), %rdi
        movl    $32, %esi
        movl    $8192, %edx             ## imm = 0x2000
        vzeroupper
        callq   _posix_memalign
        ## Same "NULL on failure" selection and index reset as above.
        xorl    %ecx, %ecx
        testl   %eax, %eax
        movq    -8(%rbp), %rax
        cmovneq %rcx, %rax
        vmovaps LCPI0_1(%rip), %xmm0    ## xmm0 = [2.017072e+03,2.017072e+03]
        .align  4, 0x90
        ## SSE-width loop, unrolled 16x, using non-temporal stores (vmovntps).
        ## Each store writes 16 bytes, but consecutive stores are 32 bytes
        ## apart, matching the i+=4 step in the C source.
LBB0_3:                                 ## =>This Inner Loop Header: Depth=1
        vmovntps        %xmm0, (%rax,%rcx,8)
        vmovntps        %xmm0, 32(%rax,%rcx,8)
        vmovntps        %xmm0, 64(%rax,%rcx,8)
        vmovntps        %xmm0, 96(%rax,%rcx,8)
        vmovntps        %xmm0, 128(%rax,%rcx,8)
        vmovntps        %xmm0, 160(%rax,%rcx,8)
        vmovntps        %xmm0, 192(%rax,%rcx,8)
        vmovntps        %xmm0, 224(%rax,%rcx,8)
        vmovntps        %xmm0, 256(%rax,%rcx,8)
        vmovntps        %xmm0, 288(%rax,%rcx,8)
        vmovntps        %xmm0, 320(%rax,%rcx,8)
        vmovntps        %xmm0, 352(%rax,%rcx,8)
        vmovntps        %xmm0, 384(%rax,%rcx,8)
        vmovntps        %xmm0, 416(%rax,%rcx,8)
        vmovntps        %xmm0, 448(%rax,%rcx,8)
        vmovntps        %xmm0, 480(%rax,%rcx,8)
        addq    $64, %rcx
        cmpq    $1024, %rcx             ## imm = 0x400
        jl      LBB0_3
## BB#4:
        ## return 0, then undo the 16-byte frame and restore rbp.
        xorl    %eax, %eax
        addq    $16, %rsp
        popq    %rbp
        retq
        .cfi_endproc

.subsections_via_symbols

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170718/f459dced/attachment-0001.html>