[llvm] [X86] combineTargetShuffle - fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y) (PR #141579)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue May 27 04:09:00 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/141579
Move VZEXT_MOVL nodes up through shift nodes.
We should be trying harder to move VZEXT_MOVL towards any associated SCALAR_TO_VECTOR nodes to make use of MOVD/Q's implicit zeroing of the upper elements.
Fixes #141475
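As a quick sanity check of the reasoning (a standalone C++ sketch, not LLVM code; the 4 x i32 lane model and the vzmovl/vshli helper names below are purely illustrative): moving the zeroing before the shift is safe because shifting an already-zero lane still produces zero, so the upper lanes end up zero either way.

#include <array>
#include <cassert>
#include <cstdint>

// Model a 4 x i32 vector lane-wise.
using Vec4 = std::array<uint32_t, 4>;

// vzmovl: keep lane 0, zero the upper lanes.
static Vec4 vzmovl(Vec4 V) { return {V[0], 0, 0, 0}; }

// VSHLI-style uniform left shift of every lane by an immediate.
static Vec4 vshli(Vec4 V, unsigned Amt) {
  for (uint32_t &Lane : V)
    Lane <<= Amt;
  return V;
}

int main() {
  Vec4 X = {0x12345678u, 0xdeadbeefu, 0xcafef00du, 0x8badf00du};
  for (unsigned Amt = 0; Amt < 32; ++Amt) {
    // (vzmovl (shift x, y)) == (shift (vzmovl x), y): the upper lanes are
    // zero either way, since shifting a zero lane still yields zero.
    assert(vzmovl(vshli(X, Amt)) == vshli(vzmovl(X), Amt));
  }
  return 0;
}

The same argument covers the logical and arithmetic right-shift forms (VSRL/VSRLI, VSRA/VSRAI), since right-shifting a zero lane also yields zero.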
From 19402113ba65633eff05ae23d1025242b38159bf Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 27 May 2025 12:08:03 +0100
Subject: [PATCH] [X86] combineTargetShuffle - fold (vzmovl (shift x, y)) ->
(shift (vzmovl x), y)
Move VZEXT_MOVL nodes up through shift nodes.
We should be trying harder to move VZEXT_MOVL towards any associated SCALAR_TO_VECTOR nodes to make use of MOVD/Q implicit zeroing of upper elements.
Fixes #141475
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 14 +++++
.../X86/codegen-no-uselist-constantdata.ll | 6 +-
.../CodeGen/X86/urem-seteq-illegal-types.ll | 19 +++---
llvm/test/CodeGen/X86/vec_insert-5.ll | 4 +-
.../CodeGen/X86/vector-shuffle-combining.ll | 62 ++++---------------
5 files changed, 39 insertions(+), 66 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6126f568aa1e1..08f2109819a0f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42368,6 +42368,20 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
case X86ISD::VZEXT_MOVL: {
SDValue N0 = N.getOperand(0);
+ // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
+ // Zeroing out the upper elements means we're just shifting them by zero.
+ // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
+ // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
+ if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
+ N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
+ N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
+ if (N0.hasOneUse())
+ return DAG.getNode(
+ N0.getOpcode(), DL, VT,
+ DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
+ N0.getOperand(1));
+ }
+
// If this a vzmovl of a full vector load, replace it with a vzload, unless
// the load is volatile.
if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
diff --git a/llvm/test/CodeGen/X86/codegen-no-uselist-constantdata.ll b/llvm/test/CodeGen/X86/codegen-no-uselist-constantdata.ll
index fef2c18b30a82..d2a755919a254 100644
--- a/llvm/test/CodeGen/X86/codegen-no-uselist-constantdata.ll
+++ b/llvm/test/CodeGen/X86/codegen-no-uselist-constantdata.ll
@@ -36,10 +36,8 @@ define <16 x i8> @load_null_offset() {
; CHECK-LABEL: load_null_offset:
; CHECK: # %bb.0:
; CHECK-NEXT: movzbl 11, %eax
-; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: pslld $8, %xmm1
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: pslld $8, %xmm0
; CHECK-NEXT: retq
%gep.null = getelementptr i8, ptr null, i64 11
%load = load i8, ptr %gep.null, align 1
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 97cc1f8a15694..7c1a1e285ca05 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -147,15 +147,14 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
-; SSE2-NEXT: pslld $10, %xmm0
-; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
-; SSE2-NEXT: orps %xmm2, %xmm3
-; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT: pslld $10, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -175,9 +174,9 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: psrld $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7]
-; SSE41-NEXT: pslld $10, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; SSE41-NEXT: pslld $10, %xmm3
; SSE41-NEXT: por %xmm2, %xmm3
; SSE41-NEXT: pand %xmm1, %xmm3
; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
@@ -200,9 +199,9 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7]
-; AVX1-NEXT: vpslld $10, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; AVX1-NEXT: vpslld $10, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
index 91743898545ee..ddde9ecb7c0fd 100644
--- a/llvm/test/CodeGen/X86/vec_insert-5.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -10,16 +10,16 @@ define void @t1(i32 %a, ptr %P) nounwind {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; X86-NEXT: pslld $12, %xmm0
-; X86-NEXT: psllq $32, %xmm0
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: t1:
; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: pslld $12, %xmm0
; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: pslld $12, %xmm0
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
%tmp12 = shl i32 %a, 12
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index f216bcacfe04a..ef8893dc4caf0 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3547,57 +3547,19 @@ define <16 x i8> @PR107289(<16 x i8> %0) {
}
define <8 x i16> @PR141475(i32 %in) {
-; SSE2-LABEL: PR141475:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: pslld $1, %xmm0
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: PR141475:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movd %edi, %xmm0
-; SSSE3-NEXT: pslld $1, %xmm0
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: PR141475:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pslld $1, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: PR141475:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpslld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: PR141475:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovd %edi, %xmm0
-; AVX2-SLOW-NEXT: vpslld $1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: retq
+; SSE-LABEL: PR141475:
+; SSE: # %bb.0:
+; SSE-NEXT: movd %edi, %xmm0
+; SSE-NEXT: pslld $1, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: retq
;
-; AVX2-FAST-LABEL: PR141475:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovd %edi, %xmm0
-; AVX2-FAST-NEXT: vpslld $1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: retq
+; AVX-LABEL: PR141475:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX-NEXT: retq
%mul = shl i32 %in, 1
%vecinit = insertelement <4 x i32> zeroinitializer, i32 %mul, i64 0
%cast = bitcast <4 x i32> %vecinit to <8 x i16>