[llvm] r359786 - [X86][SSE] lowerAddSubToHorizontalOp - enable ymm extraction+fold
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu May 2 07:00:56 PDT 2019
Author: rksimon
Date: Thu May 2 07:00:55 2019
New Revision: 359786
URL: http://llvm.org/viewvc/llvm-project?rev=359786&view=rev
Log:
[X86][SSE] lowerAddSubToHorizontalOp - enable ymm extraction+fold
Limiting scalar hadd/hsub generation to the lowest xmm looks to be unnecessary - we will be extracting one upper xmm either way, and we can remove a shuffle by using the hop, which is in line with what shouldUseHorizontalOp expects to happen anyway.
Testing on btver2 (the main target for fast-hops) shows this is beneficial even for float ops where we have a 'shuffle' to extract the float result:
https://godbolt.org/z/0R-U-K
Differential Revision: https://reviews.llvm.org/D61426
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/haddsub.ll
llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=359786&r1=359785&r2=359786&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu May 2 07:00:55 2019
@@ -19037,13 +19037,12 @@ static SDValue lowerAddSubToHorizontalOp
// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
// equivalent, so extract the 256/512-bit source op to 128-bit if we can.
- // This is free if we're extracting from the bottom lane: ymm/zmm -> xmm.
- if (NumEltsPerLane <= LExtIndex)
- return Op;
-
SDLoc DL(Op);
- if (BitWidth == 256 || BitWidth == 512)
- X = extract128BitVector(X, 0, DAG, DL);
+ if (BitWidth == 256 || BitWidth == 512) {
+ unsigned LaneIdx = LExtIndex / NumEltsPerLane;
+ X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
+ LExtIndex %= NumEltsPerLane;
+ }
// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
Modified: llvm/trunk/test/CodeGen/X86/haddsub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub.ll?rev=359786&r1=359785&r2=359786&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub.ll Thu May 2 07:00:55 2019
@@ -1004,14 +1004,22 @@ define float @extract_extract67_v8f32_fa
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
-; AVX-LABEL: extract_extract67_v8f32_fadd_f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x float> %x, i32 6
%x1 = extractelement <8 x float> %x, i32 7
%x01 = fadd float %x0, %x1
@@ -1098,14 +1106,22 @@ define float @extract_extract67_v8f32_fa
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
-; AVX-LABEL: extract_extract67_v8f32_fadd_f32_commute:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x float> %x, i32 6
%x1 = extractelement <8 x float> %x, i32 7
%x01 = fadd float %x1, %x0
@@ -1158,13 +1174,20 @@ define double @extract_extract23_v4f64_f
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
-; AVX-LABEL: extract_extract23_v4f64_fadd_f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x0 = extractelement <4 x double> %x, i32 2
%x1 = extractelement <4 x double> %x, i32 3
%x01 = fadd double %x0, %x1
@@ -1217,13 +1240,20 @@ define double @extract_extract23_v4f64_f
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
-; AVX-LABEL: extract_extract23_v4f64_fadd_f64_commute:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x0 = extractelement <4 x double> %x, i32 2
%x1 = extractelement <4 x double> %x, i32 3
%x01 = fadd double %x1, %x0
@@ -1310,13 +1340,20 @@ define float @extract_extract45_v8f32_fs
; SSE3-FAST-NEXT: hsubps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
-; AVX-LABEL: extract_extract45_v8f32_fsub_f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x float> %x, i32 4
%x1 = extractelement <8 x float> %x, i32 5
%x01 = fsub float %x0, %x1
Modified: llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll?rev=359786&r1=359785&r2=359786&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll Thu May 2 07:00:55 2019
@@ -878,9 +878,8 @@ define i16 @extract_extract89_v16i16_add
; AVX1-FAST-LABEL: extract_extract89_v16i16_add_i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX1-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-FAST-NEXT: addl %ecx, %eax
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
@@ -898,9 +897,8 @@ define i16 @extract_extract89_v16i16_add
; AVX2-FAST-LABEL: extract_extract89_v16i16_add_i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX2-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-FAST-NEXT: addl %ecx, %eax
+; AVX2-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
; AVX2-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
@@ -918,9 +916,8 @@ define i16 @extract_extract89_v16i16_add
; AVX512-FAST-LABEL: extract_extract89_v16i16_add_i16:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX512-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX512-FAST-NEXT: addl %ecx, %eax
+; AVX512-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX512-FAST-NEXT: vmovd %xmm0, %eax
; AVX512-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
@@ -1035,9 +1032,8 @@ define i16 @extract_extract89_v16i16_add
; AVX1-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX1-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-FAST-NEXT: addl %ecx, %eax
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
@@ -1055,9 +1051,8 @@ define i16 @extract_extract89_v16i16_add
; AVX2-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX2-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-FAST-NEXT: addl %ecx, %eax
+; AVX2-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
; AVX2-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
@@ -1075,9 +1070,8 @@ define i16 @extract_extract89_v16i16_add
; AVX512-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX512-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX512-FAST-NEXT: addl %ecx, %eax
+; AVX512-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX512-FAST-NEXT: vmovd %xmm0, %eax
; AVX512-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
More information about the llvm-commits
mailing list