[llvm-branch-commits] [llvm] 19d0284 - [X86][AVX] Fold extract_subvector(VSRLI/VSHLI(x, 32)) -> VSRLI/VSHLI(extract_subvector(x), 32)
Simon Pilgrim via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jan 20 06:39:44 PST 2021
Author: Simon Pilgrim
Date: 2021-01-20T14:34:54Z
New Revision: 19d02842ee56089b9208875ce4582e113e08fb6d
URL: https://github.com/llvm/llvm-project/commit/19d02842ee56089b9208875ce4582e113e08fb6d
DIFF: https://github.com/llvm/llvm-project/commit/19d02842ee56089b9208875ce4582e113e08fb6d.diff
LOG: [X86][AVX] Fold extract_subvector(VSRLI/VSHLI(x,32)) -> VSRLI/VSHLI(extract_subvector(x),32)
As discussed on D56387, if we're shifting to extract the upper/lower half of a vXi64 vector then we're actually better off performing this at the subvector level as it's very likely to fold into something.
combineConcatVectorOps can perform this in reverse if necessary.
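For illustration only (not part of the patch): a minimal scalar model of the fold, treating a v4i64 as an array of four uint64_t lanes and a 128-bit subvector extract as taking two adjacent lanes. The names V4i64, vsrli32 and extract128 are made up for the sketch and are not LLVM APIs.

// Scalar model of extract_subvector(VSRLI(x,32)) == VSRLI(extract_subvector(x),32).
#include <array>
#include <cassert>
#include <cstdint>

using V4i64 = std::array<uint64_t, 4>;
using V2i64 = std::array<uint64_t, 2>;

// Per-lane logical shift right by 32 (the VSRLI behaviour on vXi64).
static V4i64 vsrli32(const V4i64 &V) {
  return {V[0] >> 32, V[1] >> 32, V[2] >> 32, V[3] >> 32};
}
static V2i64 vsrli32(const V2i64 &V) { return {V[0] >> 32, V[1] >> 32}; }

// Extract the 128-bit subvector starting at lane Idx (0 or 2).
static V2i64 extract128(const V4i64 &V, unsigned Idx) {
  return {V[Idx], V[Idx + 1]};
}

int main() {
  V4i64 X = {0x1111222233334444ULL, 0x5555666677778888ULL,
             0x9999aaaabbbbccccULL, 0xddddeeeeffff0000ULL};
  for (unsigned Idx : {0u, 2u}) {
    // Shifting the wide vector and then extracting a half...
    V2i64 Wide = extract128(vsrli32(X), Idx);
    // ...matches shifting the already-extracted half, which is the form the
    // combine now emits so the narrow shift can fold into a shuffle later.
    V2i64 Narrow = vsrli32(extract128(X, Idx));
    assert(Wide == Narrow);
  }
}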
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/combine-sra.ll
llvm/test/CodeGen/X86/pmul.ll
llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0ee671710219..0b52b2021c73 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49799,8 +49799,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
+ unsigned InOpcode = InVec.getOpcode();
if (IdxVal == 0 && InVec.hasOneUse()) {
- unsigned InOpcode = InVec.getOpcode();
if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
@@ -49853,6 +49853,17 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
}
}
+ // Always split vXi64 logical shifts where we're extracting the upper 32-bits
+ // as this is very likely to fold into a shuffle/truncation.
+ if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
+ InVecVT.getScalarSizeInBits() == 64 &&
+ InVec.getConstantOperandAPInt(1) == 32) {
+ SDLoc DL(N);
+ SDValue Ext =
+ extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
+ return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
+ }
+
return SDValue();
}
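The new comment describes such a shift as "very likely to fold into a shuffle/truncation"; the combine-sra.ll diff below shows exactly that, with the wide vpsrlq $32 plus even-element vshufps pair collapsing into a single odd-element vshufps. A tiny scalar sketch of why that works, illustrative only and assuming the little-endian lane layout used on x86:

// Taking the low 32 bits of (x >> 32) is the same as selecting the high
// 32-bit half of x, i.e. the odd element when an i64 lane is reinterpreted
// as two i32 elements.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint64_t Lane = 0x0123456789abcdefULL;

  // trunc(lshr(x, 32)): what the shift + truncation computes per i64 lane.
  uint32_t ViaShift = static_cast<uint32_t>(Lane >> 32);

  // Reinterpret the lane as two i32 elements (little-endian); element 1 is
  // the "odd" element that a shuffle such as vshufps [1,3] selects.
  uint32_t Elts[2];
  std::memcpy(Elts, &Lane, sizeof(Lane));
  uint32_t ViaShuffle = Elts[1];

  assert(ViaShift == ViaShuffle);
}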
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index 28a73cdb6a41..453a61b8565e 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -207,9 +207,8 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index db6009f273d2..56476eea323e 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1150,9 +1150,8 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
index a274baefc1ef..f0cb46e63d8f 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -834,19 +834,20 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
;
; AVX2-64-LABEL: uitofp_v4i64_v4f64:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpsrlq $32, %ymm0, %ymm1
-; AVX2-64-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-64-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX2-64-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX2-64-NEXT: vmovq %xmm1, %rax
+; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-64-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX2-64-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
; AVX2-64-NEXT: vmovq %xmm2, %rax
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-64-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
-; AVX2-64-NEXT: vmovq %xmm1, %rax
-; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1
-; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9]
; AVX2-64-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2