[llvm] r320379 - [X86] Modify Nontemporal tests to avoid deadstore optimization.
Nirav Dave via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 11 07:35:41 PST 2017
Author: niravd
Date: Mon Dec 11 07:35:40 2017
New Revision: 320379
URL: http://llvm.org/viewvc/llvm-project?rev=320379&view=rev
Log:
[X86] Modify Nontemporal tests to avoid deadstore optimization.

All of the nontemporal stores in these tests write through the same pointer, so earlier stores could be eliminated as dead. Thread loads of a new i32* %loadptr argument between the stores and return the accumulated sum so that every store stays live.
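Below is a minimal standalone sketch of the pattern the diffs apply (the function and value names are illustrative, not taken from the tests): a potentially aliasing load between two stores to the same address keeps the earlier store from being treated as dead, and returning the summed loads keeps the loads themselves live.

; Before: the second store overwrites the first with nothing observing it
; in between, so the first store can be treated as dead.
define void @before(<4 x float> %a, <4 x float> %b, <4 x float>* %p) {
  store <4 x float> %a, <4 x float>* %p, align 16, !nontemporal !0
  store <4 x float> %b, <4 x float>* %p, align 16, !nontemporal !0
  ret void
}

; After: each load from %q may alias %p, so the store preceding it must be
; preserved; returning the sum keeps the loads from being dead as well.
define i32 @after(<4 x float> %a, <4 x float> %b, <4 x float>* %p, i32* %q) {
  %v0 = load i32, i32* %q, align 1
  store <4 x float> %a, <4 x float>* %p, align 16, !nontemporal !0
  %v1 = load i32, i32* %q, align 1
  store <4 x float> %b, <4 x float>* %p, align 16, !nontemporal !0
  %v2 = load i32, i32* %q, align 1
  %sum1 = add i32 %v0, %v1
  %sum2 = add i32 %sum1, %v2
  ret i32 %sum2
}

!0 = !{i32 1}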
Modified:
llvm/trunk/test/CodeGen/X86/avx2-nontemporal.ll
llvm/trunk/test/CodeGen/X86/avx512-nontemporal.ll
llvm/trunk/test/CodeGen/X86/avx512vl-nontemporal.ll
llvm/trunk/test/CodeGen/X86/nontemporal.ll
Modified: llvm/trunk/test/CodeGen/X86/avx2-nontemporal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-nontemporal.ll?rev=320379&r1=320378&r2=320379&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-nontemporal.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-nontemporal.ll Mon Dec 11 07:35:40 2017
@@ -2,7 +2,7 @@
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=X64
-define void @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E, <8 x i32> %F, <16 x i16> %G, <32 x i8> %H) nounwind {
+define i32 @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E, <8 x i32> %F, <16 x i16> %G, <32 x i8> %H, i32* %loadptr) nounwind {
; X32-LABEL: f:
; X32: # %bb.0:
; X32-NEXT: pushl %ebp
@@ -12,19 +12,26 @@ define void @f(<8 x float> %A, i8* %B, <
; X32-NEXT: vmovdqa 104(%ebp), %ymm3
; X32-NEXT: vmovdqa 72(%ebp), %ymm4
; X32-NEXT: vmovdqa 40(%ebp), %ymm5
-; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: vaddps .LCPI0_0, %ymm0, %ymm0
-; X32-NEXT: vmovntps %ymm0, (%eax)
-; X32-NEXT: vpaddq .LCPI0_1, %ymm2, %ymm0
-; X32-NEXT: vmovntdq %ymm0, (%eax)
-; X32-NEXT: vaddpd .LCPI0_2, %ymm1, %ymm0
-; X32-NEXT: vmovntpd %ymm0, (%eax)
-; X32-NEXT: vpaddd .LCPI0_3, %ymm5, %ymm0
-; X32-NEXT: vmovntdq %ymm0, (%eax)
-; X32-NEXT: vpaddw .LCPI0_4, %ymm4, %ymm0
-; X32-NEXT: vmovntdq %ymm0, (%eax)
-; X32-NEXT: vpaddb .LCPI0_5, %ymm3, %ymm0
-; X32-NEXT: vmovntdq %ymm0, (%eax)
+; X32-NEXT: movl 8(%ebp), %ecx
+; X32-NEXT: movl 136(%ebp), %edx
+; X32-NEXT: movl (%edx), %eax
+; X32-NEXT: vaddps {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT: vmovntps %ymm0, (%ecx)
+; X32-NEXT: vpaddq {{\.LCPI.*}}, %ymm2, %ymm0
+; X32-NEXT: addl (%edx), %eax
+; X32-NEXT: vmovntdq %ymm0, (%ecx)
+; X32-NEXT: vaddpd {{\.LCPI.*}}, %ymm1, %ymm0
+; X32-NEXT: addl (%edx), %eax
+; X32-NEXT: vmovntpd %ymm0, (%ecx)
+; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm5, %ymm0
+; X32-NEXT: addl (%edx), %eax
+; X32-NEXT: vmovntdq %ymm0, (%ecx)
+; X32-NEXT: vpaddw {{\.LCPI.*}}, %ymm4, %ymm0
+; X32-NEXT: addl (%edx), %eax
+; X32-NEXT: vmovntdq %ymm0, (%ecx)
+; X32-NEXT: vpaddb {{\.LCPI.*}}, %ymm3, %ymm0
+; X32-NEXT: addl (%edx), %eax
+; X32-NEXT: vmovntdq %ymm0, (%ecx)
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: vzeroupper
@@ -32,39 +39,58 @@ define void @f(<8 x float> %A, i8* %B, <
;
; X64-LABEL: f:
; X64: # %bb.0:
+; X64-NEXT: movl (%rsi), %eax
; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vmovntps %ymm0, (%rdi)
; X64-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm0
+; X64-NEXT: addl (%rsi), %eax
; X64-NEXT: vmovntdq %ymm0, (%rdi)
; X64-NEXT: vaddpd {{.*}}(%rip), %ymm1, %ymm0
+; X64-NEXT: addl (%rsi), %eax
; X64-NEXT: vmovntpd %ymm0, (%rdi)
; X64-NEXT: vpaddd {{.*}}(%rip), %ymm3, %ymm0
+; X64-NEXT: addl (%rsi), %eax
; X64-NEXT: vmovntdq %ymm0, (%rdi)
; X64-NEXT: vpaddw {{.*}}(%rip), %ymm4, %ymm0
+; X64-NEXT: addl (%rsi), %eax
; X64-NEXT: vmovntdq %ymm0, (%rdi)
; X64-NEXT: vpaddb {{.*}}(%rip), %ymm5, %ymm0
+; X64-NEXT: addl (%rsi), %eax
; X64-NEXT: vmovntdq %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+ %v0 = load i32, i32* %loadptr, align 1
%cast = bitcast i8* %B to <8 x float>*
%A2 = fadd <8 x float> %A, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
store <8 x float> %A2, <8 x float>* %cast, align 32, !nontemporal !0
+ %v1 = load i32, i32* %loadptr, align 1
%cast1 = bitcast i8* %B to <4 x i64>*
%E2 = add <4 x i64> %E, <i64 1, i64 2, i64 3, i64 4>
store <4 x i64> %E2, <4 x i64>* %cast1, align 32, !nontemporal !0
+ %v2 = load i32, i32* %loadptr, align 1
%cast2 = bitcast i8* %B to <4 x double>*
%C2 = fadd <4 x double> %C, <double 1.0, double 2.0, double 3.0, double 4.0>
store <4 x double> %C2, <4 x double>* %cast2, align 32, !nontemporal !0
+ %v3 = load i32, i32* %loadptr, align 1
%cast3 = bitcast i8* %B to <8 x i32>*
%F2 = add <8 x i32> %F, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
store <8 x i32> %F2, <8 x i32>* %cast3, align 32, !nontemporal !0
+ %v4 = load i32, i32* %loadptr, align 1
%cast4 = bitcast i8* %B to <16 x i16>*
%G2 = add <16 x i16> %G, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
store <16 x i16> %G2, <16 x i16>* %cast4, align 32, !nontemporal !0
+ %v5 = load i32, i32* %loadptr, align 1
%cast5 = bitcast i8* %B to <32 x i8>*
%H2 = add <32 x i8> %H, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
store <32 x i8> %H2, <32 x i8>* %cast5, align 32, !nontemporal !0
- ret void
+ %v6 = load i32, i32* %loadptr, align 1
+ %sum1 = add i32 %v0, %v1
+ %sum2 = add i32 %sum1, %v2
+ %sum3 = add i32 %sum2, %v3
+ %sum4 = add i32 %sum3, %v4
+ %sum5 = add i32 %sum4, %v5
+ %sum6 = add i32 %sum5, %v6
+ ret i32 %sum5
}
!0 = !{i32 1}
Modified: llvm/trunk/test/CodeGen/X86/avx512-nontemporal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-nontemporal.ll?rev=320379&r1=320378&r2=320379&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-nontemporal.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-nontemporal.ll Mon Dec 11 07:35:40 2017
@@ -1,31 +1,44 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck %s
-define void @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x double> %CC, <8 x i64> %E, <8 x i64> %EE, <16 x i32> %F, <16 x i32> %FF, <32 x i16> %G, <32 x i16> %GG, <64 x i8> %H, <64 x i8> %HH) {
+define i32 @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x double> %CC, <8 x i64> %E, <8 x i64> %EE, <16 x i32> %F, <16 x i32> %FF, <32 x i16> %G, <32 x i16> %GG, <64 x i8> %H, <64 x i8> %HH, i32 * %loadptr) {
; CHECK: vmovntps %z
+ %v0 = load i32, i32* %loadptr, align 1
%cast = bitcast i8* %B to <16 x float>*
%A2 = fadd <16 x float> %A, %AA
store <16 x float> %A2, <16 x float>* %cast, align 64, !nontemporal !0
+ %v1 = load i32, i32* %loadptr, align 1
; CHECK: vmovntdq %z
%cast1 = bitcast i8* %B to <8 x i64>*
%E2 = add <8 x i64> %E, %EE
store <8 x i64> %E2, <8 x i64>* %cast1, align 64, !nontemporal !0
+ %v2 = load i32, i32* %loadptr, align 1
; CHECK: vmovntpd %z
%cast2 = bitcast i8* %B to <8 x double>*
%C2 = fadd <8 x double> %C, %CC
store <8 x double> %C2, <8 x double>* %cast2, align 64, !nontemporal !0
+ %v3 = load i32, i32* %loadptr, align 1
; CHECK: vmovntdq %z
%cast3 = bitcast i8* %B to <16 x i32>*
%F2 = add <16 x i32> %F, %FF
store <16 x i32> %F2, <16 x i32>* %cast3, align 64, !nontemporal !0
+ %v4 = load i32, i32* %loadptr, align 1
; CHECK: vmovntdq %z
%cast4 = bitcast i8* %B to <32 x i16>*
%G2 = add <32 x i16> %G, %GG
store <32 x i16> %G2, <32 x i16>* %cast4, align 64, !nontemporal !0
+ %v5 = load i32, i32* %loadptr, align 1
; CHECK: vmovntdq %z
%cast5 = bitcast i8* %B to <64 x i8>*
%H2 = add <64 x i8> %H, %HH
store <64 x i8> %H2, <64 x i8>* %cast5, align 64, !nontemporal !0
- ret void
+ %v6 = load i32, i32* %loadptr, align 1
+ %sum1 = add i32 %v0, %v1
+ %sum2 = add i32 %sum1, %v2
+ %sum3 = add i32 %sum2, %v3
+ %sum4 = add i32 %sum3, %v4
+ %sum5 = add i32 %sum4, %v5
+ %sum6 = add i32 %sum5, %v6
+ ret i32 %sum6
}
!0 = !{i32 1}
Modified: llvm/trunk/test/CodeGen/X86/avx512vl-nontemporal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-nontemporal.ll?rev=320379&r1=320378&r2=320379&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-nontemporal.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-nontemporal.ll Mon Dec 11 07:35:40 2017
@@ -1,34 +1,48 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s
-define void @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE) {
+define i32 @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE, i32* %loadptr) {
; CHECK: vmovntps %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
+ %v0 = load i32, i32* %loadptr, align 1
%cast = bitcast i8* %B to <8 x float>*
%A2 = fadd <8 x float> %A, %AA
store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0
; CHECK: vmovntdq %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
+ %v1 = load i32, i32* %loadptr, align 1
%cast1 = bitcast i8* %B to <4 x i64>*
%E2 = add <4 x i64> %E, %EE
store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0
; CHECK: vmovntpd %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
+ %v2 = load i32, i32* %loadptr, align 1
%cast2 = bitcast i8* %B to <4 x double>*
%C2 = fadd <4 x double> %C, %CC
store <4 x double> %C2, <4 x double>* %cast2, align 64, !nontemporal !0
- ret void
+ %v3 = load i32, i32* %loadptr, align 1
+ %sum1 = add i32 %v0, %v1
+ %sum2 = add i32 %sum1, %v2
+ %sum3 = add i32 %sum2, %v3
+ ret i32 %sum3
}
-define void @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE) {
+define i32 @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE, i32* %loadptr) {
+ %v0 = load i32, i32* %loadptr, align 1
; CHECK: vmovntps %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
%cast = bitcast i8* %B to <4 x float>*
%A2 = fadd <4 x float> %A, %AA
store <4 x float> %A2, <4 x float>* %cast, align 64, !nontemporal !0
; CHECK: vmovntdq %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
+ %v1 = load i32, i32* %loadptr, align 1
%cast1 = bitcast i8* %B to <2 x i64>*
%E2 = add <2 x i64> %E, %EE
store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0
; CHECK: vmovntpd %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
+ %v2 = load i32, i32* %loadptr, align 1
%cast2 = bitcast i8* %B to <2 x double>*
%C2 = fadd <2 x double> %C, %CC
store <2 x double> %C2, <2 x double>* %cast2, align 64, !nontemporal !0
- ret void
+ %v3 = load i32, i32* %loadptr, align 1
+ %sum1 = add i32 %v0, %v1
+ %sum2 = add i32 %sum1, %v2
+ %sum3 = add i32 %sum2, %v3
+ ret i32 %sum3
}
!0 = !{i32 1}
Modified: llvm/trunk/test/CodeGen/X86/nontemporal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal.ll?rev=320379&r1=320378&r2=320379&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal.ll Mon Dec 11 07:35:40 2017
@@ -4,34 +4,50 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX
-define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 x i32> %F, <8 x i16> %G, <16 x i8> %H, i64 %I) nounwind {
+define i32 @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 x i32> %F, <8 x i16> %G, <16 x i8> %H, i64 %I, i32* %loadptr) nounwind {
; X32-SSE-LABEL: f:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pushl %ebp
; X32-SSE-NEXT: movl %esp, %ebp
+; X32-SSE-NEXT: pushl %edi
+; X32-SSE-NEXT: pushl %esi
; X32-SSE-NEXT: andl $-16, %esp
-; X32-SSE-NEXT: subl $16, %esp
-; X32-SSE-NEXT: movl 72(%ebp), %eax
; X32-SSE-NEXT: movl 76(%ebp), %ecx
+; X32-SSE-NEXT: movl 12(%ebp), %eax
; X32-SSE-NEXT: movdqa 56(%ebp), %xmm3
; X32-SSE-NEXT: movdqa 40(%ebp), %xmm4
; X32-SSE-NEXT: movdqa 24(%ebp), %xmm5
-; X32-SSE-NEXT: movl 8(%ebp), %edx
+; X32-SSE-NEXT: movl 8(%ebp), %esi
+; X32-SSE-NEXT: movl 80(%ebp), %edx
+; X32-SSE-NEXT: movl (%edx), %edi
; X32-SSE-NEXT: addps {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movntps %xmm0, (%edx)
+; X32-SSE-NEXT: movntps %xmm0, (%esi)
; X32-SSE-NEXT: paddq {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: movntdq %xmm2, (%edx)
+; X32-SSE-NEXT: addl (%edx), %edi
+; X32-SSE-NEXT: movntdq %xmm2, (%esi)
; X32-SSE-NEXT: addpd {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movntpd %xmm1, (%edx)
+; X32-SSE-NEXT: addl (%edx), %edi
+; X32-SSE-NEXT: movntpd %xmm1, (%esi)
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm5
-; X32-SSE-NEXT: movntdq %xmm5, (%edx)
+; X32-SSE-NEXT: addl (%edx), %edi
+; X32-SSE-NEXT: movntdq %xmm5, (%esi)
; X32-SSE-NEXT: paddw {{\.LCPI.*}}, %xmm4
-; X32-SSE-NEXT: movntdq %xmm4, (%edx)
+; X32-SSE-NEXT: addl (%edx), %edi
+; X32-SSE-NEXT: movntdq %xmm4, (%esi)
; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm3
-; X32-SSE-NEXT: movntdq %xmm3, (%edx)
-; X32-SSE-NEXT: movntil %ecx, 4(%edx)
-; X32-SSE-NEXT: movntil %eax, (%edx)
-; X32-SSE-NEXT: movl %ebp, %esp
+; X32-SSE-NEXT: addl (%edx), %edi
+; X32-SSE-NEXT: movntdq %xmm3, (%esi)
+; X32-SSE-NEXT: addl (%edx), %edi
+; X32-SSE-NEXT: movntil %eax, (%esi)
+; X32-SSE-NEXT: movl (%edx), %eax
+; X32-SSE-NEXT: movntil %ecx, 4(%esi)
+; X32-SSE-NEXT: movl 72(%ebp), %ecx
+; X32-SSE-NEXT: movntil %ecx, (%esi)
+; X32-SSE-NEXT: addl %edi, %eax
+; X32-SSE-NEXT: addl (%edx), %eax
+; X32-SSE-NEXT: leal -8(%ebp), %esp
+; X32-SSE-NEXT: popl %esi
+; X32-SSE-NEXT: popl %edi
; X32-SSE-NEXT: popl %ebp
; X32-SSE-NEXT: retl
;
@@ -39,90 +55,141 @@ define void @f(<4 x float> %A, i8* %B, <
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: pushl %ebp
; X32-AVX-NEXT: movl %esp, %ebp
+; X32-AVX-NEXT: pushl %edi
+; X32-AVX-NEXT: pushl %esi
; X32-AVX-NEXT: andl $-16, %esp
-; X32-AVX-NEXT: subl $16, %esp
-; X32-AVX-NEXT: movl 72(%ebp), %eax
; X32-AVX-NEXT: movl 76(%ebp), %ecx
+; X32-AVX-NEXT: movl 12(%ebp), %eax
; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm3
; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm4
; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm5
-; X32-AVX-NEXT: movl 8(%ebp), %edx
+; X32-AVX-NEXT: movl 8(%ebp), %esi
+; X32-AVX-NEXT: movl 80(%ebp), %edx
+; X32-AVX-NEXT: movl (%edx), %edi
; X32-AVX-NEXT: vaddps {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-AVX-NEXT: vmovntps %xmm0, (%edx)
+; X32-AVX-NEXT: vmovntps %xmm0, (%esi)
; X32-AVX-NEXT: vpaddq {{\.LCPI.*}}, %xmm2, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: addl (%edx), %edi
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
; X32-AVX-NEXT: vaddpd {{\.LCPI.*}}, %xmm1, %xmm0
-; X32-AVX-NEXT: vmovntpd %xmm0, (%edx)
+; X32-AVX-NEXT: addl (%edx), %edi
+; X32-AVX-NEXT: vmovntpd %xmm0, (%esi)
; X32-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm5, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: addl (%edx), %edi
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
; X32-AVX-NEXT: vpaddw {{\.LCPI.*}}, %xmm4, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: addl (%edx), %edi
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
; X32-AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm3, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
-; X32-AVX-NEXT: movntil %ecx, 4(%edx)
-; X32-AVX-NEXT: movntil %eax, (%edx)
-; X32-AVX-NEXT: movl %ebp, %esp
+; X32-AVX-NEXT: addl (%edx), %edi
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
+; X32-AVX-NEXT: addl (%edx), %edi
+; X32-AVX-NEXT: movntil %eax, (%esi)
+; X32-AVX-NEXT: movl (%edx), %eax
+; X32-AVX-NEXT: movntil %ecx, 4(%esi)
+; X32-AVX-NEXT: movl 72(%ebp), %ecx
+; X32-AVX-NEXT: movntil %ecx, (%esi)
+; X32-AVX-NEXT: addl %edi, %eax
+; X32-AVX-NEXT: addl (%edx), %eax
+; X32-AVX-NEXT: leal -8(%ebp), %esp
+; X32-AVX-NEXT: popl %esi
+; X32-AVX-NEXT: popl %edi
; X32-AVX-NEXT: popl %ebp
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: f:
; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movl (%rcx), %eax
; X64-SSE-NEXT: addps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: movntps %xmm0, (%rdi)
; X64-SSE-NEXT: paddq {{.*}}(%rip), %xmm2
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntdq %xmm2, (%rdi)
; X64-SSE-NEXT: addpd {{.*}}(%rip), %xmm1
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntpd %xmm1, (%rdi)
; X64-SSE-NEXT: paddd {{.*}}(%rip), %xmm3
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntdq %xmm3, (%rdi)
; X64-SSE-NEXT: paddw {{.*}}(%rip), %xmm4
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntdq %xmm4, (%rdi)
; X64-SSE-NEXT: paddb {{.*}}(%rip), %xmm5
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntdq %xmm5, (%rdi)
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntil %esi, (%rdi)
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntiq %rdx, (%rdi)
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: f:
; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl (%rcx), %eax
; X64-AVX-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovntps %xmm0, (%rdi)
; X64-AVX-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm0
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
; X64-AVX-NEXT: vaddpd {{.*}}(%rip), %xmm1, %xmm0
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: vmovntpd %xmm0, (%rdi)
; X64-AVX-NEXT: vpaddd {{.*}}(%rip), %xmm3, %xmm0
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
; X64-AVX-NEXT: vpaddw {{.*}}(%rip), %xmm4, %xmm0
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
; X64-AVX-NEXT: vpaddb {{.*}}(%rip), %xmm5, %xmm0
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: movntil %esi, (%rdi)
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: movntiq %rdx, (%rdi)
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: retq
+ %v0 = load i32, i32* %loadptr, align 1
%cast = bitcast i8* %B to <4 x float>*
%A2 = fadd <4 x float> %A, <float 1.0, float 2.0, float 3.0, float 4.0>
store <4 x float> %A2, <4 x float>* %cast, align 16, !nontemporal !0
+ %v1 = load i32, i32* %loadptr, align 1
%cast1 = bitcast i8* %B to <2 x i64>*
%E2 = add <2 x i64> %E, <i64 1, i64 2>
store <2 x i64> %E2, <2 x i64>* %cast1, align 16, !nontemporal !0
+ %v2 = load i32, i32* %loadptr, align 1
%cast2 = bitcast i8* %B to <2 x double>*
%C2 = fadd <2 x double> %C, <double 1.0, double 2.0>
store <2 x double> %C2, <2 x double>* %cast2, align 16, !nontemporal !0
+ %v3 = load i32, i32* %loadptr, align 1
%cast3 = bitcast i8* %B to <4 x i32>*
%F2 = add <4 x i32> %F, <i32 1, i32 2, i32 3, i32 4>
store <4 x i32> %F2, <4 x i32>* %cast3, align 16, !nontemporal !0
+ %v4 = load i32, i32* %loadptr, align 1
%cast4 = bitcast i8* %B to <8 x i16>*
%G2 = add <8 x i16> %G, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
store <8 x i16> %G2, <8 x i16>* %cast4, align 16, !nontemporal !0
+ %v5 = load i32, i32* %loadptr, align 1
%cast5 = bitcast i8* %B to <16 x i8>*
%H2 = add <16 x i8> %H, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
store <16 x i8> %H2, <16 x i8>* %cast5, align 16, !nontemporal !0
+ %v6 = load i32, i32* %loadptr, align 1
%cast6 = bitcast i8* %B to i32*
store i32 %D, i32* %cast6, align 1, !nontemporal !0
+ %v7 = load i32, i32* %loadptr, align 1
%cast7 = bitcast i8* %B to i64*
store i64 %I, i64* %cast7, align 1, !nontemporal !0
- ret void
+ %v8 = load i32, i32* %loadptr, align 1
+ %sum1 = add i32 %v0, %v1
+ %sum2 = add i32 %sum1, %v2
+ %sum3 = add i32 %sum2, %v3
+ %sum4 = add i32 %sum3, %v4
+ %sum5 = add i32 %sum4, %v5
+ %sum6 = add i32 %sum5, %v6
+ %sum7 = add i32 %sum6, %v7
+ %sum8 = add i32 %sum7, %v8
+ ret i32 %sum8
}
!0 = !{i32 1}