[llvm] fb1d61b - [X86][AVX] Fold concat(ps*lq(x,32),ps*lq(y,32)) -> shuffle(concat(x,y),zero) (PR46621)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed May 12 10:05:01 PDT 2021
Author: Simon Pilgrim
Date: 2021-05-12T18:04:40+01:00
New Revision: fb1d61b7257ccd5ba0c96bcea78d6516384ce5b6
URL: https://github.com/llvm/llvm-project/commit/fb1d61b7257ccd5ba0c96bcea78d6516384ce5b6
DIFF: https://github.com/llvm/llvm-project/commit/fb1d61b7257ccd5ba0c96bcea78d6516384ce5b6.diff
LOG: [X86][AVX] Fold concat(ps*lq(x,32),ps*lq(y,32)) -> shuffle(concat(x,y),zero) (PR46621)
On AVX1 targets we can handle v4i64 logical shifts by 32 bits as a pair of v8f32 shuffles with zero.
I was hoping to put this in LowerScalarImmediateShift, but performing the fold that early causes regressions where other instructions end up re-splitting the subvectors.
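As a side note (not part of the commit), here is a minimal scalar sketch of why the fold is sound on a little-endian target: a 64-bit logical shift by exactly 32 bits just moves one 32-bit lane and zeroes the other, which is what the v8i32 shuffles with a zero vector in the patch express per 64-bit element. The standalone program, lane names, and the mapping to the patch's masks ({8,0,8,2,...} for VSHLI, {1,8,3,8,...} for VSRLI, with indices >= 8 selecting the zero vector) are illustrative assumptions, not code from the tree.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const uint64_t X = 0x1122334455667788ULL;

  // View X as two little-endian 32-bit lanes: Lanes[0] = low, Lanes[1] = high.
  uint32_t Lanes[2];
  std::memcpy(Lanes, &X, sizeof(X));

  // Logical shift left by 32: result lanes are {zero, low}, i.e. the shuffle
  // picks the zero vector for the low lane and the original low lane for the
  // high lane (assumed analogue of the {8,0,...} mask in the patch).
  uint32_t ShlLanes[2] = {0u, Lanes[0]};
  uint64_t Shl;
  std::memcpy(&Shl, ShlLanes, sizeof(Shl));
  assert(Shl == (X << 32));

  // Logical shift right by 32: result lanes are {high, zero}, i.e. the shuffle
  // picks the original high lane for the low lane and zero for the high lane
  // (assumed analogue of the {1,8,...} mask in the patch).
  uint32_t SrlLanes[2] = {Lanes[1], 0u};
  uint64_t Srl;
  std::memcpy(&Srl, SrlLanes, sizeof(Srl));
  assert(Srl == (X >> 32));

  return 0;
}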
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vec_int_to_fp.ll
llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
llvm/test/CodeGen/X86/vector-shift-shl-256.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fc12f88b9701..01d6a3c369e5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49703,8 +49703,26 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
break;
case X86ISD::VSHLI:
- case X86ISD::VSRAI:
case X86ISD::VSRLI:
+ // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
+ // TODO: Move this to LowerScalarImmediateShift?
+ if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op.getConstantOperandAPInt(1) == 32;
+ })) {
+ SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
+ SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
+ if (Op0.getOpcode() == X86ISD::VSHLI) {
+ Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
+ {8, 0, 8, 2, 8, 4, 8, 6});
+ } else {
+ Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
+ {1, 8, 3, 8, 5, 8, 7, 8});
+ }
+ return DAG.getBitcast(VT, Res);
+ }
+ LLVM_FALLTHROUGH;
+ case X86ISD::VSRAI:
if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
(VT.is512BitVector() && Subtarget.useAVX512Regs() &&
(EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 9361af10962e..08f65b94af6e 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -930,15 +930,13 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; AVX1-LABEL: uitofp_4i64_to_4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i64_to_4f64:
@@ -3670,17 +3668,15 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
;
; AVX1-LABEL: uitofp_load_4i64_to_4f64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_4i64_to_4f64:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index 9fd0960c16b2..ffa8f4d85ade 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -1397,10 +1397,9 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: shift32_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shift32_v4i64:
@@ -1410,10 +1409,9 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
;
; XOPAVX1-LABEL: shift32_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; XOPAVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shift32_v4i64:
@@ -1433,10 +1431,9 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
;
; X86-AVX1-LABEL: shift32_v4i64:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: shift32_v4i64:
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 0af23983b817..0d1b26fd08a0 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1306,10 +1306,9 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: shift32_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shift32_v4i64:
@@ -1319,10 +1318,9 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
;
; XOPAVX1-LABEL: shift32_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shift32_v4i64:
@@ -1342,10 +1340,9 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
;
; X86-AVX1-LABEL: shift32_v4i64:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: shift32_v4i64: