[llvm] 49fd2dd - [X86] LowerShift - don't prematurely lower to x86 vector shift imm instructions (#120282)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 18 08:08:50 PST 2024
Author: Simon Pilgrim
Date: 2024-12-18T16:08:45Z
New Revision: 49fd2dde21655f95309abb17ad1d3392afe4985f
URL: https://github.com/llvm/llvm-project/commit/49fd2dde21655f95309abb17ad1d3392afe4985f
DIFF: https://github.com/llvm/llvm-project/commit/49fd2dde21655f95309abb17ad1d3392afe4985f.diff
LOG: [X86] LowerShift - don't prematurely lower to x86 vector shift imm instructions (#120282)
When splitting a vector shift with 2 unique shift amounts into shuffle(shift(x,c1),shift(x,c2)), don't lower directly with getTargetVShiftByConstNode; emit generic shift nodes instead, so that any further canonicalization still applies (e.g. shl(X,1) to add(X,X)). This can give notably better throughput on some x86 targets.
Noticed on #120270
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/combine-sdiv.ll
llvm/test/CodeGen/X86/lower-vec-shift.ll
llvm/test/CodeGen/X86/vec_shift6.ll
llvm/test/CodeGen/X86/widen_arith-4.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ad5e2e7ea83bbb..35c0974733aba4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30105,9 +30105,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
(VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
canWidenShuffleElements(ShuffleMask))) {
SDValue Shift1 =
- getTargetVShiftByConstNode(X86OpcI, dl, VT, R, AmtA, DAG);
+ DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
SDValue Shift2 =
- getTargetVShiftByConstNode(X86OpcI, dl, VT, R, AmtB, DAG);
+ DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
}
}
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 76f21604500482..42f09d04da26ed 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2190,7 +2190,7 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: psllw $1, %xmm2
+; SSE41-NEXT: paddw %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
@@ -2202,9 +2202,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT: psraw $8, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $1, %xmm3
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
+; SSE41-NEXT: psllw $7, %xmm3
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7]
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: psraw $8, %xmm2
@@ -2225,7 +2225,7 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -2235,9 +2235,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3
-; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/lower-vec-shift.ll b/llvm/test/CodeGen/X86/lower-vec-shift.ll
index 67e0c1b3cf2b3e..9d4935ef564def 100644
--- a/llvm/test/CodeGen/X86/lower-vec-shift.ll
+++ b/llvm/test/CodeGen/X86/lower-vec-shift.ll
@@ -265,11 +265,11 @@ define <16 x i16> @test11(<16 x i16> %a) {
; AVX1-LABEL: test11:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
+; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6],xmm1[7]
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -294,10 +294,10 @@ define <16 x i16> @test12(<16 x i16> %a) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -305,7 +305,7 @@ define <16 x i16> @test12(<16 x i16> %a) {
; AVX2-LABEL: test12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $3, %ymm0, %ymm1
-; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
; AVX2-NEXT: retq
%lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll
index 48ed39e5da88f2..71e659c681d17d 100644
--- a/llvm/test/CodeGen/X86/vec_shift6.ll
+++ b/llvm/test/CodeGen/X86/vec_shift6.ll
@@ -28,20 +28,20 @@ define <8 x i16> @test2(<8 x i16> %a) {
; SSE2-LABEL: test2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllw $1, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test2:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllw $1, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $1, %xmm0, %xmm1
+; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm1
; AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: retq
%shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
@@ -56,7 +56,7 @@ define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-LABEL: test3:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $1, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pslld $2, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
@@ -65,7 +65,7 @@ define <4 x i32> @test3(<4 x i32> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $2, %xmm1
-; SSE41-NEXT: pslld $1, %xmm0
+; SSE41-NEXT: paddd %xmm0, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
@@ -81,14 +81,14 @@ define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $1, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test4:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pslld $1, %xmm1
+; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/widen_arith-4.ll b/llvm/test/CodeGen/X86/widen_arith-4.ll
index c49882ffe0b389..ea6bf66fd2923a 100644
--- a/llvm/test/CodeGen/X86/widen_arith-4.ll
+++ b/llvm/test/CodeGen/X86/widen_arith-4.ll
@@ -65,7 +65,7 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind {
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $2, %xmm2
-; SSE41-NEXT: psllw $1, %xmm1
+; SSE41-NEXT: paddw %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: pextrw $4, %xmm1, 8(%rcx,%rax)
; SSE41-NEXT: movq %xmm2, (%rcx,%rax)
More information about the llvm-commits mailing list