[llvm] r373871 - [X86][AVX] Access a scalar float/double as a free extract from a broadcast load (PR43217)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 6 14:11:45 PDT 2019
Author: rksimon
Date: Sun Oct 6 14:11:45 2019
New Revision: 373871
URL: http://llvm.org/viewvc/llvm-project?rev=373871&view=rev
Log:
[X86][AVX] Access a scalar float/double as a free extract from a broadcast load (PR43217)
If a floating-point scalar is loaded and then used both as a scalar and as the source of a vector broadcast, perform the load as a broadcast load and then extract the scalar for 'free' from element 0 of the broadcast result.
This involved switching the order of the X86ISD::VBROADCAST combines so that we only convert to X86ISD::VBROADCAST_LOAD once all other canonicalizations have been attempted.
Adds a TargetLowering::DAGCombinerInfo::recursivelyDeleteUnusedNodes wrapper that forwards to DAGCombiner::recursivelyDeleteUnusedNodes.
Fixes PR43217
Differential Revision: https://reviews.llvm.org/D68544
Modified:
llvm/trunk/include/llvm/CodeGen/TargetLowering.h
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx-vbroadcast.ll
Modified: llvm/trunk/include/llvm/CodeGen/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TargetLowering.h?rev=373871&r1=373870&r2=373871&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h Sun Oct 6 14:11:45 2019
@@ -3263,6 +3263,8 @@ public:
SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true);
SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true);
+ bool recursivelyDeleteUnusedNodes(SDNode *N);
+
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO);
};
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=373871&r1=373870&r2=373871&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Sun Oct 6 14:11:45 2019
@@ -761,6 +761,11 @@ CombineTo(SDNode *N, SDValue Res0, SDVal
return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
}
+bool TargetLowering::DAGCombinerInfo::
+recursivelyDeleteUnusedNodes(SDNode *N) {
+ return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N);
+}
+
void TargetLowering::DAGCombinerInfo::
CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=373871&r1=373870&r2=373871&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Oct 6 14:11:45 2019
@@ -33429,8 +33429,19 @@ static SDValue combineTargetShuffle(SDVa
if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
+ // Share broadcast with the longest vector and extract low subvector (free).
+ for (SDNode *User : Src->uses())
+ if (User != N.getNode() &&
+ (User->getOpcode() == X86ISD::VBROADCAST ||
+ User->getOpcode() == X86ISD::VBROADCAST_LOAD) &&
+ User->getValueSizeInBits(0) > VT.getSizeInBits()) {
+ return extractSubVector(SDValue(User, 0), 0, DAG, DL,
+ VT.getSizeInBits());
+ }
+
// vbroadcast(scalarload X) -> vbroadcast_load X
- if (!SrcVT.isVector() && Src.hasOneUse() &&
+ // For float loads, extract other uses of the scalar from the broadcast.
+ if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
@@ -33438,17 +33449,19 @@ static SDValue combineTargetShuffle(SDVa
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
- DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
- return BcastLd;
- }
-
- // Share broadcast with the longest vector and extract low subvector (free).
- for (SDNode *User : Src->uses())
- if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
- User->getValueSizeInBits(0) > VT.getSizeInBits()) {
- return extractSubVector(SDValue(User, 0), 0, DAG, DL,
- VT.getSizeInBits());
+ // If the load value is used only by N, replace it via CombineTo N.
+ bool NoReplaceExtract = Src.hasOneUse();
+ DCI.CombineTo(N.getNode(), BcastLd);
+ if (NoReplaceExtract) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ } else {
+ SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
+ DAG.getIntPtrConstant(0, DL));
+ DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
}
+ return N; // Return N so it doesn't get rechecked!
+ }
return SDValue();
}
Modified: llvm/trunk/test/CodeGen/X86/avx-vbroadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-vbroadcast.ll?rev=373871&r1=373870&r2=373871&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-vbroadcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-vbroadcast.ll Sun Oct 6 14:11:45 2019
@@ -159,18 +159,14 @@ define <4 x double> @C2(double* %ptr, do
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: vmovsd %xmm0, (%eax)
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vbroadcastsd (%ecx), %ymm0
+; X32-NEXT: vmovlps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: C2:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: vmovsd %xmm0, (%rsi)
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: vmovlps %xmm0, (%rsi)
; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 8
@@ -231,18 +227,14 @@ define <8 x float> @D3(float* %ptr, floa
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vbroadcastss (%ecx), %ymm0
; X32-NEXT: vmovss %xmm0, (%eax)
-; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: D3:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: vmovss %xmm0, (%rsi)
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
@@ -285,16 +277,14 @@ define <4 x float> @e2(float* %ptr, floa
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vbroadcastss (%ecx), %xmm0
; X32-NEXT: vmovss %xmm0, (%eax)
-; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: e2:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: vmovss %xmm0, (%rsi)
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
@@ -669,16 +659,14 @@ define <2 x double> @I2(double* %ptr, do
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: vmovsd %xmm0, (%eax)
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X32-NEXT: vmovlps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: I2:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: vmovsd %xmm0, (%rsi)
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-NEXT: vmovlps %xmm0, (%rsi)
; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 4
@@ -884,7 +872,6 @@ define void @broadcast_v16i32(i32* %a, <
;
; Broadcast scale factor for xyz vector - slp will have vectorized xy.
-; FIXME: Load as a broadcast and then use the scalar 0'th element.
;
define double @broadcast_scale_xyz(double* nocapture readonly, double* nocapture readonly) nounwind {
; X32-LABEL: broadcast_scale_xyz:
@@ -892,9 +879,8 @@ define double @broadcast_scale_xyz(doubl
; X32-NEXT: subl $12, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; X32-NEXT: vmulpd (%eax), %xmm1, %xmm1
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X32-NEXT: vmulpd (%eax), %xmm0, %xmm1
; X32-NEXT: vmulsd 16(%eax), %xmm0, %xmm0
; X32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X32-NEXT: vaddsd %xmm2, %xmm1, %xmm1
@@ -906,9 +892,8 @@ define double @broadcast_scale_xyz(doubl
;
; X64-LABEL: broadcast_scale_xyz:
; X64: ## %bb.0:
-; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; X64-NEXT: vmulpd (%rsi), %xmm1, %xmm1
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-NEXT: vmulpd (%rsi), %xmm0, %xmm1
; X64-NEXT: vmulsd 16(%rsi), %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm1
More information about the llvm-commits mailing list