[llvm] r363853 - [x86] avoid vector load narrowing with extracted store uses (PR42305)
Author: spatel
Date: Wed Jun 19 11:13:47 2019
New Revision: 363853
URL: http://llvm.org/viewvc/llvm-project?rev=363853&view=rev
Log:
[x86] avoid vector load narrowing with extracted store uses (PR42305)
This is an exception to the rule that we should prefer xmm ops to ymm ops.
As shown in PR42305:
https://bugs.llvm.org/show_bug.cgi?id=42305
...the store folding opportunity with vextractf128 may result in better
perf by reducing the instruction count.
Differential Revision: https://reviews.llvm.org/D63517
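
For illustration, here is a minimal IR sketch of the pattern this patch targets, modeled on the load_split test updated below (the function and value names are illustrative, not part of the patch): a 256-bit load whose only non-chain uses are 128-bit subvector extracts that each feed a store.

  define void @extract_store_sketch(<8 x float>* %ld, <4 x float>* %lo.dst, <4 x float>* %hi.dst) {
    ; One wide (256-bit) load with multiple uses...
    %v = load <8 x float>, <8 x float>* %ld, align 1
    ; ...where every use is a 128-bit subvector extract...
    %lo = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    %hi = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    ; ...that is stored directly.
    store <4 x float> %lo, <4 x float>* %lo.dst, align 1
    store <4 x float> %hi, <4 x float>* %hi.dst, align 1
    ret void
  }

Before this change, shouldReduceLoadWidth allowed the ymm load to be narrowed into two xmm loads, giving two loads plus two stores. With the change, the load stays at 256 bits and the upper half is stored with a memory-destination vextractf128, as in the updated CHECK lines below: one ymm load, one xmm store, and one extract-to-memory (plus vzeroupper), i.e. one fewer instruction overall.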
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/sandybridge-loads.ll
llvm/trunk/test/CodeGen/X86/widen_load-3.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=363853&r1=363852&r2=363853&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jun 19 11:13:47 2019
@@ -4810,6 +4810,26 @@ bool X86TargetLowering::shouldReduceLoad
if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+
+ // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
+ // those uses are extracted directly into a store, then the extract + store
+ // can be store-folded. Therefore, it's probably not worth splitting the load.
+ EVT VT = Load->getValueType(0);
+ if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
+ for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
+ // Skip uses of the chain value. Result 0 of the node is the load value.
+ if (UI.getUse().getResNo() != 0)
+ continue;
+
+ // If this use is not an extract + store, it's probably worth splitting.
+ if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
+ UI->use_begin()->getOpcode() != ISD::STORE)
+ return true;
+ }
+ // All non-chain uses are extract + store.
+ return false;
+ }
+
return true;
}
Modified: llvm/trunk/test/CodeGen/X86/sandybridge-loads.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sandybridge-loads.ll?rev=363853&r1=363852&r2=363853&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sandybridge-loads.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sandybridge-loads.ll Wed Jun 19 11:13:47 2019
@@ -30,10 +30,9 @@ define void @widestores(<8 x float>* %a,
; CHECK-LABEL: widestores:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %ymm0
-; CHECK-NEXT: vmovaps (%rsi), %xmm1
-; CHECK-NEXT: vmovaps 16(%rsi), %xmm2
+; CHECK-NEXT: vmovaps (%rsi), %ymm1
; CHECK-NEXT: vmovaps %ymm0, (%rsi)
-; CHECK-NEXT: vmovaps %xmm2, 16(%rdi)
+; CHECK-NEXT: vextractf128 $1, %ymm1, 16(%rdi)
; CHECK-NEXT: vmovaps %xmm1, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/widen_load-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_load-3.ll?rev=363853&r1=363852&r2=363853&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_load-3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_load-3.ll Wed Jun 19 11:13:47 2019
@@ -146,10 +146,10 @@ define void @load_split(<8 x float>* %ld
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: vmovups (%edx), %xmm0
-; X86-AVX-NEXT: vmovups 16(%edx), %xmm1
+; X86-AVX-NEXT: vmovups (%edx), %ymm0
; X86-AVX-NEXT: vmovups %xmm0, (%ecx)
-; X86-AVX-NEXT: vmovups %xmm1, (%eax)
+; X86-AVX-NEXT: vextractf128 $1, %ymm0, (%eax)
+; X86-AVX-NEXT: vzeroupper
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: load_split:
@@ -162,10 +162,10 @@ define void @load_split(<8 x float>* %ld
;
; X64-AVX-LABEL: load_split:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %xmm0
-; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1
+; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: vmovups %xmm0, (%rsi)
-; X64-AVX-NEXT: vmovups %xmm1, (%rdx)
+; X64-AVX-NEXT: vextractf128 $1, %ymm0, (%rdx)
+; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
%t256 = load <8 x float>, <8 x float>* %ld, align 1
%b128 = shufflevector <8 x float> %t256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -178,39 +178,35 @@ define void @load_split(<8 x float>* %ld
define void @load_split_more(float* %src, i32* %idx, float* %dst) nounwind {
; X86-SSE-LABEL: load_split_more:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl (%edx), %esi
-; X86-SSE-NEXT: movups (%ecx), %xmm0
-; X86-SSE-NEXT: movups 16(%ecx), %xmm1
-; X86-SSE-NEXT: movups %xmm0, (%eax,%esi,4)
-; X86-SSE-NEXT: movl 4(%edx), %ecx
+; X86-SSE-NEXT: movups (%edx), %xmm0
+; X86-SSE-NEXT: movups 16(%edx), %xmm1
+; X86-SSE-NEXT: movl (%ecx), %edx
+; X86-SSE-NEXT: movups %xmm0, (%eax,%edx,4)
+; X86-SSE-NEXT: movl 4(%ecx), %ecx
; X86-SSE-NEXT: movups %xmm1, (%eax,%ecx,4)
-; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: load_split_more:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: movl (%edx), %esi
-; X86-AVX-NEXT: vmovups (%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 16(%ecx), %xmm1
-; X86-AVX-NEXT: vmovups %xmm0, (%eax,%esi,4)
-; X86-AVX-NEXT: movl 4(%edx), %ecx
-; X86-AVX-NEXT: vmovups %xmm1, (%eax,%ecx,4)
-; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: vmovups (%edx), %ymm0
+; X86-AVX-NEXT: movl (%ecx), %edx
+; X86-AVX-NEXT: vmovups %xmm0, (%eax,%edx,4)
+; X86-AVX-NEXT: movl 4(%ecx), %ecx
+; X86-AVX-NEXT: vextractf128 $1, %ymm0, (%eax,%ecx,4)
+; X86-AVX-NEXT: vzeroupper
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: load_split_more:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movslq (%rsi), %rax
; X64-SSE-NEXT: movups (%rdi), %xmm0
; X64-SSE-NEXT: movups 16(%rdi), %xmm1
+; X64-SSE-NEXT: movslq (%rsi), %rax
; X64-SSE-NEXT: movups %xmm0, (%rdx,%rax,4)
; X64-SSE-NEXT: movslq 4(%rsi), %rax
; X64-SSE-NEXT: movups %xmm1, (%rdx,%rax,4)
@@ -218,12 +214,12 @@ define void @load_split_more(float* %src
;
; X64-AVX-LABEL: load_split_more:
; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: movslq (%rsi), %rax
-; X64-AVX-NEXT: vmovups (%rdi), %xmm0
-; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovups %xmm0, (%rdx,%rax,4)
; X64-AVX-NEXT: movslq 4(%rsi), %rax
-; X64-AVX-NEXT: vmovups %xmm1, (%rdx,%rax,4)
+; X64-AVX-NEXT: vextractf128 $1, %ymm0, (%rdx,%rax,4)
+; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
%v.i = bitcast float* %src to <8 x float>*
%tmp = load <8 x float>, <8 x float>* %v.i, align 1