[llvm] r369942 - [X86] Add a hack to combinePMULDQ to manually turn SIGN_EXTEND_VECTOR_INREG/ZERO_EXTEND_VECTOR_INREG inputs into an ANY_EXTEND_VECTOR_INREG style shuffle
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 26 11:23:26 PDT 2019
Author: ctopper
Date: Mon Aug 26 11:23:26 2019
New Revision: 369942
URL: http://llvm.org/viewvc/llvm-project?rev=369942&view=rev
Log:
[X86] Add a hack to combinePMULDQ to manually turn SIGN_EXTEND_VECTOR_INREG/ZERO_EXTEND_VECTOR_INREG inputs into an ANY_EXTEND_VECTOR_INREG style shuffle
ANY_EXTEND_VECTOR_INREG isn't currently marked Legal, which prevents SimplifyDemandedBits from turning SIGN/ZERO_EXTEND_VECTOR_INREG into it after op legalization. And even if we did make it Legal, combineExtInVec doesn't do shuffle combining on the VECTOR_INREG nodes until AVX1.

This patch adds a quick hack to combinePMULDQ to directly emit a vector shuffle corresponding to an ANY_EXTEND_VECTOR_INREG operation. This is safe because PMULDQ/PMULUDQ read only the low 32 bits of each 64-bit element, so the bits produced by the original sign or zero extension are don't-care. Doing the conversion here avoids both of the issues above without creating any other regressions in our tests. The xop-ifma.ll change here also showed up when I tried to resurrect D56306 and seems to be the only improvement that patch still produces; this is a more direct way to get the benefit.
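To illustrate the shuffle trick (a hand-written IR sketch, not part of the patch; %x and the lane names a/b/c/d are made up for the example): for a v4i32 input %x = <a, b, c, d>, the emitted shuffle and bitcast compute

  %shuf = shufflevector <4 x i32> %x, <4 x i32> %x,
                        <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
  ; %shuf = <a, undef, b, undef>
  %cast = bitcast <4 x i32> %shuf to <2 x i64>
  ; On little-endian x86 each i64 lane of %cast holds a or b in its low
  ; 32 bits with undef above; PMULDQ/PMULUDQ never look at the high half.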
Differential Revision: https://reviews.llvm.org/D66436
Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/pmul.ll
    llvm/trunk/test/CodeGen/X86/xop-ifma.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=369942&r1=369941&r2=369942&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Aug 26 11:23:26 2019
@@ -44659,6 +44659,34 @@ static SDValue combinePMULDQ(SDNode *N,
   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
     return SDValue(N, 0);
 
+  // If the input is an extend_invec and the SimplifyDemandedBits call didn't
+  // convert it to any_extend_invec, due to the LegalOperations check, do the
+  // conversion directly to a vector shuffle manually. This exposes combine
+  // opportunities missed by combineExtInVec not calling
+  // combineX86ShufflesRecursively on SSE4.1 targets.
+  // FIXME: This is basically a hack around several other issues related to
+  // ANY_EXTEND_VECTOR_INREG.
+  if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
+      (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
+       LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+      LHS.getOperand(0).getValueType() == MVT::v4i32) {
+    SDLoc dl(N);
+    LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
+                               LHS.getOperand(0), { 0, -1, 1, -1 });
+    LHS = DAG.getBitcast(MVT::v2i64, LHS);
+    return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
+  }
+  if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
+      (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
+       RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+      RHS.getOperand(0).getValueType() == MVT::v4i32) {
+    SDLoc dl(N);
+    RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
+                               RHS.getOperand(0), { 0, -1, 1, -1 });
+    RHS = DAG.getBitcast(MVT::v2i64, RHS);
+    return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
+  }
+
   return SDValue();
 }
Modified: llvm/trunk/test/CodeGen/X86/pmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pmul.ll?rev=369942&r1=369941&r2=369942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pmul.ll Mon Aug 26 11:23:26 2019
@@ -1131,14 +1131,13 @@ define <4 x i32> @mul_v4i64_zero_lower(<
 ;
 ; SSE41-LABEL: mul_v4i64_zero_lower:
 ; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    psrlq $32, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pmuludq %xmm2, %xmm3
 ; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT:    pmuludq %xmm1, %xmm0
-; SSE41-NEXT:    psrlq $32, %xmm2
-; SSE41-NEXT:    pmuludq %xmm3, %xmm2
-; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: mul_v4i64_zero_lower:
Modified: llvm/trunk/test/CodeGen/X86/xop-ifma.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/xop-ifma.ll?rev=369942&r1=369941&r2=369942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/xop-ifma.ll (original)
+++ llvm/trunk/test/CodeGen/X86/xop-ifma.ll Mon Aug 26 11:23:26 2019
@@ -67,15 +67,13 @@ define <8 x i32> @test_mul_v8i32_add_v8i
 define <4 x i64> @test_mulx_v4i32_add_v4i64(<4 x i32> %a0, <4 x i32> %a1, <4 x i64> %a2) {
 ; XOP-AVX1-LABEL: test_mulx_v4i32_add_v4i64:
 ; XOP-AVX1:       # %bb.0:
-; XOP-AVX1-NEXT:    vpmovsxdq %xmm0, %xmm3
-; XOP-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; XOP-AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
-; XOP-AVX1-NEXT:    vpmovsxdq %xmm1, %xmm4
-; XOP-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; XOP-AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; XOP-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; XOP-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; XOP-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; XOP-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; XOP-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; XOP-AVX1-NEXT:    vpmacsdql %xmm5, %xmm1, %xmm0, %xmm0
-; XOP-AVX1-NEXT:    vpmacsdql %xmm2, %xmm4, %xmm3, %xmm1
+; XOP-AVX1-NEXT:    vpmacsdql %xmm2, %xmm3, %xmm4, %xmm1
 ; XOP-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; XOP-AVX1-NEXT:    retq
 ;