[llvm-branch-commits] [llvm] 4214ca9 - [X86][AVX] Attempt to fold vpermf128(op(x, i), op(y, i)) -> op(vpermf128(x, y), i)
Simon Pilgrim via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jan 11 09:13:25 PST 2021
Author: Simon Pilgrim
Date: 2021-01-11T16:59:25Z
New Revision: 4214ca96145c9487407925b121b85fafb1179209
URL: https://github.com/llvm/llvm-project/commit/4214ca96145c9487407925b121b85fafb1179209
DIFF: https://github.com/llvm/llvm-project/commit/4214ca96145c9487407925b121b85fafb1179209.diff
LOG: [X86][AVX] Attempt to fold vpermf128(op(x,i),op(y,i)) -> op(vpermf128(x,y),i)
If a vperm2f128/vperm2i128 lane shuffle is acting on two similar 'in-lane' ops (ops that never move data across a 128-bit lane boundary), try to perform the lane shuffle first, which allows the two ops to be merged into one.
This will help us fix one of the regressions in D56387.
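Illustration (not part of the commit): the fold is valid because a 128-bit lane permute only moves whole lanes, while VSHLI/VSRLI/VSRAI never move bits across a lane boundary, so the two operations commute. Below is a minimal standalone C++ sketch of that identity using AVX2 intrinsics; it is not the LLVM code, and it assumes an AVX2-capable host (compile with -mavx2):

#include <immintrin.h>
#include <cstdio>
#include <cstring>

int main() {
  // Two arbitrary <4 x i64> vectors.
  __m256i x = _mm256_set_epi64x(0x1111222233334444LL, 0x5555666677778888LL,
                                0x0123456789abcdefLL, 0x7edcba9876543210LL);
  __m256i y = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0fLL, 0x7070707070707070LL,
                                0x1234123412341234LL, 0x5678567856785678LL);

  // Shift each source first, then permute the 128-bit lanes (imm 0x31 picks
  // the high lane of each source, as in the vector-trunc.ll test below).
  __m256i before = _mm256_permute2x128_si256(_mm256_srli_epi64(x, 32),
                                             _mm256_srli_epi64(y, 32), 0x31);

  // Permute the unshifted sources first, then shift the result once.
  __m256i after = _mm256_srli_epi64(_mm256_permute2x128_si256(x, y, 0x31), 32);

  // Both evaluation orders must produce identical bits.
  std::puts(std::memcmp(&before, &after, sizeof(before)) == 0 ? "identical"
                                                              : "DIFFER");
  return 0;
}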
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7895f883863f..2f9de876a87f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36665,6 +36665,43 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
   return SDValue();
 }
 
+/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
+static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
+                                                      SelectionDAG &DAG,
+                                                      const SDLoc &DL) {
+  assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
+
+  MVT VT = V.getSimpleValueType();
+  SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
+  SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
+  unsigned SrcOpc0 = Src0.getOpcode();
+  unsigned SrcOpc1 = Src1.getOpcode();
+  EVT SrcVT0 = Src0.getValueType();
+  EVT SrcVT1 = Src1.getValueType();
+
+  // TODO: Under what circumstances should we push perm2f128 up when we have one
+  // active src?
+  if (SrcOpc0 != SrcOpc1 || SrcVT0 != SrcVT1)
+    return SDValue();
+
+  switch (SrcOpc0) {
+  case X86ISD::VSHLI:
+  case X86ISD::VSRLI:
+  case X86ISD::VSRAI:
+    if (Src0.getOperand(1) == Src1.getOperand(1)) {
+      SDValue Res = DAG.getNode(
+          X86ISD::VPERM2X128, DL, VT, DAG.getBitcast(VT, Src0.getOperand(0)),
+          DAG.getBitcast(VT, Src1.getOperand(0)), V.getOperand(2));
+      Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res),
+                        Src0.getOperand(1));
+      return DAG.getBitcast(VT, Res);
+    }
+    break;
+  }
+
+  return SDValue();
+}
+
 /// Try to combine x86 target specific shuffles.
 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
@@ -37045,6 +37082,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     return SDValue();
   }
   case X86ISD::VPERM2X128: {
+    if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
+      return Res;
+
     // If both 128-bit values were inserted into high halves of 256-bit values,
     // the shuffle can be reduced to a concatenation of subvectors:
     // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
@@ -37053,6 +37093,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
     SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
     unsigned Imm = N.getConstantOperandVal(2);
+
     if (!(Imm == 0x31 &&
           Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
           Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index bd8b7dd355cc..f35e315bbb0b 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -107,11 +107,9 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
 ;
 ; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
 ; AVX2-SLOW:       # %bb.0: # %entry
-; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
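A note on the AVX2-SLOW change above (my reading, not part of the commit): the two vpsrlq instructions disappear entirely rather than merely moving below the shuffle. Once the lane shuffle is pulled above the shifts, the remaining shift-by-32 plus truncation is presumably folded by the existing shuffle combines into a vshufps that selects the odd 32-bit elements ([1,3]/[5,7] instead of [0,2]/[4,6]), and with no integer ops left in the chain the shuffles switch to their FP-domain forms (vperm2f128/vinsertf128).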