[llvm] r366268 - [x86] use more phadd for reductions
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 16 14:30:41 PDT 2019
Author: spatel
Date: Tue Jul 16 14:30:41 2019
New Revision: 366268
URL: http://llvm.org/viewvc/llvm-project?rev=366268&view=rev
Log:
[x86] use more phadd for reductions
This is part of what is requested by PR42023:
https://bugs.llvm.org/show_bug.cgi?id=42023
An extension is still needed for FP add (FHADD), but it is not clear to me
exactly which flags should be required to allow it, so I left that as a TODO.
We're still missing patterns for partial reductions when the input vector
is 256-bit or 512-bit, but I think that's a failure of vector narrowing.
If we can reduce the widths, then this matching should work on those tests.
Differential Revision: https://reviews.llvm.org/D64760
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll
llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll
llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=366268&r1=366267&r2=366268&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Jul 16 14:30:41 2019
@@ -35624,6 +35624,57 @@ static SDValue scalarizeExtEltFP(SDNode
llvm_unreachable("All opcodes should return within switch");
}
+/// Try to convert a vector reduction sequence composed of binops and shuffles
+/// into horizontal ops.
+static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+ return SDValue();
+ SDValue Index = ExtElt->getOperand(1);
+ if (!isNullConstant(Index))
+ return SDValue();
+
+ // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
+ ISD::NodeType Opc;
+ SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
+ if (!Rdx)
+ return SDValue();
+
+ EVT VT = ExtElt->getValueType(0);
+ EVT VecVT = ExtElt->getOperand(0).getValueType();
+ if (VecVT.getScalarType() != VT)
+ return SDValue();
+
+ unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
+ SDLoc DL(ExtElt);
+
+ // 256-bit horizontal instructions operate on 128-bit chunks rather than
+ // across the whole vector, so we need an extract + hop preliminary stage.
+ // This is the only step where the operands of the hop are not the same value.
+ // TODO: We could extend this to handle 512-bit or even longer vectors.
+ if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
+ ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
+ SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
+ VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
+ Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
+ }
+ if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
+ !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
+ return SDValue();
+
+ // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
+ assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
+ unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
+ for (unsigned i = 0; i != ReductionSteps; ++i)
+ Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+}
+
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
@@ -35710,6 +35761,9 @@ static SDValue combineExtractVectorElt(S
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
+ if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
+ return V;
+
if (SDValue V = scalarizeExtEltFP(N, DAG))
return V;
Modified: llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll?rev=366268&r1=366267&r2=366268&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll Tue Jul 16 14:30:41 2019
@@ -1903,10 +1903,8 @@ define i16 @hadd16_8(<8 x i16> %x223) {
;
; SSE3-FAST-LABEL: hadd16_8:
; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE3-FAST-NEXT: paddw %xmm0, %xmm1
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE3-FAST-NEXT: paddw %xmm1, %xmm0
+; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1926,10 +1924,8 @@ define i16 @hadd16_8(<8 x i16> %x223) {
;
; AVX-FAST-LABEL: hadd16_8:
; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1956,10 +1952,9 @@ define i32 @hadd32_4(<4 x i32> %x225) {
;
; SSE3-FAST-LABEL: hadd32_4:
; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE3-FAST-NEXT: paddd %xmm0, %xmm1
-; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
-; SSE3-FAST-NEXT: movd %xmm1, %eax
+; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_4:
@@ -1973,8 +1968,7 @@ define i32 @hadd32_4(<4 x i32> %x225) {
;
; AVX-FAST-LABEL: hadd32_4:
; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: retq
@@ -2097,10 +2091,8 @@ define i32 @hadd32_16(<16 x i32> %x225)
define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize {
; SSE3-LABEL: hadd16_8_optsize:
; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE3-NEXT: paddw %xmm0, %xmm1
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE3-NEXT: paddw %xmm1, %xmm0
+; SSE3-NEXT: phaddw %xmm0, %xmm0
+; SSE3-NEXT: phaddw %xmm0, %xmm0
; SSE3-NEXT: phaddw %xmm0, %xmm0
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: # kill: def $ax killed $ax killed $eax
@@ -2108,10 +2100,8 @@ define i16 @hadd16_8_optsize(<8 x i16> %
;
; AVX-LABEL: hadd16_8_optsize:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -2129,16 +2119,14 @@ define i16 @hadd16_8_optsize(<8 x i16> %
define i32 @hadd32_4_optsize(<4 x i32> %x225) optsize {
; SSE3-LABEL: hadd32_4_optsize:
; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE3-NEXT: paddd %xmm0, %xmm1
-; SSE3-NEXT: phaddd %xmm1, %xmm1
-; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: phaddd %xmm0, %xmm0
+; SSE3-NEXT: phaddd %xmm0, %xmm0
+; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_4_optsize:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll?rev=366268&r1=366267&r2=366268&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll Tue Jul 16 14:30:41 2019
@@ -254,8 +254,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX1-FAST-LABEL: test_v4i32:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: retq
@@ -307,9 +306,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1-FAST-LABEL: test_v8i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
@@ -635,10 +633,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX1-FAST-LABEL: test_v8i16:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -704,11 +700,9 @@ define i16 @test_v16i16(<16 x i16> %a0)
; AVX1-FAST-LABEL: test_v16i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll?rev=366268&r1=366267&r2=366268&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll Tue Jul 16 14:30:41 2019
@@ -241,8 +241,7 @@ define i32 @test_v4i32(<4 x i32> %a0) {
;
; AVX1-FAST-LABEL: test_v4i32:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: retq
@@ -294,9 +293,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1-FAST-LABEL: test_v8i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
@@ -605,10 +603,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
;
; AVX1-FAST-LABEL: test_v8i16:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -674,11 +670,9 @@ define i16 @test_v16i16(<16 x i16> %a0)
; AVX1-FAST-LABEL: test_v16i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
More information about the llvm-commits
mailing list