[llvm] r350928 - [x86] allow insert/extract when matching horizontal ops
Author: spatel
Date: Fri Jan 11 06:27:59 2019
New Revision: 350928
URL: http://llvm.org/viewvc/llvm-project?rev=350928&view=rev
Log:
[x86] allow insert/extract when matching horizontal ops
Previously, we limited this transform to cases where the elements extracted
into the build vector come from vectors of the same type as the build vector,
but that's not required.

There's a slight potential regression seen in the AVX512 result for phadd --
we're using the 256-bit flavor of the instruction now even though the 128-bit
subset is sufficient. The same problem could already be seen in the AVX2
result. Follow-up patches will attempt to narrow that back down.
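To picture the transform outside of LLVM, here is a hedged, standalone C++
illustration (it is not part of the patch, and the constants are invented for
the example): narrowing a 256-bit input to its low 128 bits is only a register
re-interpretation, so a single 128-bit horizontal add produces the pairwise
sums that a build vector of low-element extracts asks for. Compile with -mavx.

#include <immintrin.h>
#include <cassert>

int main() {
  // _mm256_set_ps lists elements from highest index to lowest, so by
  // element index A = {1,2,3,4,5,6,7,8} and B = {10,20,...,80}.
  __m256 A = _mm256_set_ps(8, 7, 6, 5, 4, 3, 2, 1);
  __m256 B = _mm256_set_ps(80, 70, 60, 50, 40, 30, 20, 10);

  // Free narrowing: reinterpret the low 128 bits of each input (ymm -> xmm).
  __m128 ALo = _mm256_castps256_ps128(A);
  __m128 BLo = _mm256_castps256_ps128(B);

  // One 128-bit horizontal add covers all four pairwise sums.
  __m128 H = _mm_hadd_ps(ALo, BLo);

  float R[4];
  _mm_storeu_ps(R, H);
  assert(R[0] == 1 + 2 && R[1] == 3 + 4);     // pairs from A's low half
  assert(R[2] == 10 + 20 && R[3] == 30 + 40); // pairs from B's low half
  return 0;
}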
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/haddsub-undef.ll
llvm/trunk/test/CodeGen/X86/phaddsub-undef.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=350928&r1=350927&r2=350928&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Jan 11 06:27:59 2019
@@ -8307,10 +8307,20 @@ static bool isHopBuildVector(const Build
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
SelectionDAG &DAG, unsigned HOpcode,
SDValue V0, SDValue V1) {
- // TODO: We should extract/insert to match the size of the build vector.
+ // If either input vector is not the same size as the build vector,
+ // extract/insert the low bits to the correct size.
+ // This is free (examples: zmm --> xmm, xmm --> ymm).
MVT VT = BV->getSimpleValueType(0);
- if (V0.getValueType() != VT || V1.getValueType() != VT)
- return SDValue();
+ unsigned Width = VT.getSizeInBits();
+ if (V0.getValueSizeInBits() > Width)
+ V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
+ else if (V0.getValueSizeInBits() < Width)
+ V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
+
+ if (V1.getValueSizeInBits() > Width)
+ V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
+ else if (V1.getValueSizeInBits() < Width)
+ V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
}
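As a hedged restatement of the hunk above in plain C++ (the enum and function
names below are invented for illustration and do not exist in LLVM; the real
code calls the file-local extractSubVector/insertSubVector helpers on
SDValues), the width handling reduces to a simple rule:

#include <cstdio>

enum class WidthFixup { None, ExtractLow, WidenWithUndef };

// Decide how a horizontal-op input must be adjusted to match the
// bit width of the build vector being lowered.
WidthFixup fixupFor(unsigned InputBits, unsigned BuildVecBits) {
  if (InputBits > BuildVecBits)
    return WidthFixup::ExtractLow;     // e.g. zmm input, xmm build vector
  if (InputBits < BuildVecBits)
    return WidthFixup::WidenWithUndef; // e.g. xmm input, ymm build vector
  return WidthFixup::None;             // widths already match
}

int main() {
  std::printf("%d\n", (int)fixupFor(512, 128)); // 1 = ExtractLow
  std::printf("%d\n", (int)fixupFor(128, 256)); // 2 = WidenWithUndef
  std::printf("%d\n", (int)fixupFor(256, 256)); // 0 = None
}

Either fixup is just a subregister read or a widening into an undef vector,
which is what the new comment in the hunk means by "free".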
Modified: llvm/trunk/test/CodeGen/X86/haddsub-undef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub-undef.ll?rev=350928&r1=350927&r2=350928&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub-undef.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub-undef.ll Fri Jan 11 06:27:59 2019
@@ -737,21 +737,11 @@ define <4 x float> @v8f32_inputs_v4f32_o
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-SLOW-LABEL: v8f32_inputs_v4f32_output_0101:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vaddss %xmm3, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
-;
-; AVX-FAST-LABEL: v8f32_inputs_v4f32_output_0101:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT: vzeroupper
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: v8f32_inputs_v4f32_output_0101:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
%b0 = extractelement <8 x float> %b, i32 0
@@ -769,26 +759,11 @@ define <4 x float> @v8f32_input0_v4f32_o
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-SLOW-LABEL: v8f32_input0_v4f32_output_0123:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1
-; AVX-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
-;
-; AVX-FAST-LABEL: v8f32_input0_v4f32_output_0123:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX-FAST-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
-; AVX-FAST-NEXT: vzeroupper
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: v8f32_input0_v4f32_output_0123:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
%b2 = extractelement <4 x float> %b, i32 2
@@ -806,28 +781,11 @@ define <4 x float> @v8f32_input1_v4f32_o
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-SLOW-LABEL: v8f32_input1_v4f32_output_2301:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
-;
-; AVX-FAST-LABEL: v8f32_input1_v4f32_output_2301:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm0
-; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX-FAST-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-FAST-NEXT: vzeroupper
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: v8f32_input1_v4f32_output_2301:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%a2 = extractelement <4 x float> %a, i32 2
%a3 = extractelement <4 x float> %a, i32 3
%b0 = extractelement <8 x float> %b, i32 0
@@ -847,14 +805,7 @@ define <4 x float> @v8f32_inputs_v4f32_o
;
; AVX-LABEL: v8f32_inputs_v4f32_output_2323:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%a2 = extractelement <8 x float> %a, i32 2
@@ -876,45 +827,21 @@ define <4 x float> @v16f32_inputs_v4f32_
;
; AVX1-SLOW-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
+; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
+; AVX1-FAST-NEXT: vhaddps %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
-; AVX512-SLOW-LABEL: v16f32_inputs_v4f32_output_0123:
-; AVX512-SLOW: # %bb.0:
-; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1
-; AVX512-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512-SLOW-NEXT: vzeroupper
-; AVX512-SLOW-NEXT: retq
-;
-; AVX512-FAST-LABEL: v16f32_inputs_v4f32_output_0123:
-; AVX512-FAST: # %bb.0:
-; AVX512-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-FAST-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX512-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX512-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
-; AVX512-FAST-NEXT: vzeroupper
-; AVX512-FAST-NEXT: retq
+; AVX512-LABEL: v16f32_inputs_v4f32_output_0123:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%a0 = extractelement <16 x float> %a, i32 0
%a1 = extractelement <16 x float> %a, i32 1
%b2 = extractelement <16 x float> %b, i32 2
@@ -944,15 +871,7 @@ define <8 x float> @v16f32_inputs_v8f32_
;
; AVX512-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-NEXT: vaddss %xmm1, %xmm3, %xmm1
-; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%a4 = extractelement <16 x float> %a, i32 4
%a5 = extractelement <16 x float> %a, i32 5
Modified: llvm/trunk/test/CodeGen/X86/phaddsub-undef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/phaddsub-undef.ll?rev=350928&r1=350927&r2=350928&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/phaddsub-undef.ll (original)
+++ llvm/trunk/test/CodeGen/X86/phaddsub-undef.ll Fri Jan 11 06:27:59 2019
@@ -176,7 +176,7 @@ define <16 x i32> @test16_v16i32_undef(<
;
; AVX512-LABEL: test16_v16i32_undef:
; AVX512: # %bb.0:
-; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: retq
%vecext = extractelement <16 x i32> %a, i32 0
%vecext1 = extractelement <16 x i32> %a, i32 1
@@ -252,7 +252,7 @@ define <16 x i32> @test17_v16i32_undef(<
; AVX512-LABEL: test17_v16i32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%vecext = extractelement <16 x i32> %a, i32 0
%vecext1 = extractelement <16 x i32> %a, i32 1
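The log notes that the 256-bit vphaddd shows up here even though the 128-bit
subset would be sufficient. One reason a follow-up can narrow it safely:
vphaddd operates per 128-bit lane, so the low 128 bits of the wide result
match a 128-bit vphaddd on the low halves of the inputs. A hedged, standalone
check of that lane behavior (not part of the patch; compile with -mavx2):

#include <immintrin.h>
#include <cassert>
#include <cstring>

int main() {
  __m256i A = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);

  // 256-bit flavor, as in the updated AVX512 check lines above.
  __m256i Wide = _mm256_hadd_epi32(A, A);

  // 128-bit flavor on the low halves, as a narrower lowering would use.
  __m128i ALo    = _mm256_castsi256_si128(A);
  __m128i Narrow = _mm_hadd_epi32(ALo, ALo);

  // vphaddd works per 128-bit lane, so the low 128 bits of the wide
  // result equal the narrow result exactly.
  int WideLo[4], NarrowRes[4];
  _mm_storeu_si128((__m128i *)WideLo, _mm256_castsi256_si128(Wide));
  _mm_storeu_si128((__m128i *)NarrowRes, Narrow);
  assert(std::memcmp(WideLo, NarrowRes, sizeof(WideLo)) == 0);
  return 0;
}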