[llvm] 70bd80d - [X86] combineTargetShuffle - commute VPERMV3 shuffles so any load is on the RHS
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Nov 23 07:33:40 PST 2024
Author: Simon Pilgrim
Date: 2024-11-23T15:27:10Z
New Revision: 70bd80dc51b62453210f6203c31ea826dd0675c2
URL: https://github.com/llvm/llvm-project/commit/70bd80dc51b62453210f6203c31ea826dd0675c2
DIFF: https://github.com/llvm/llvm-project/commit/70bd80dc51b62453210f6203c31ea826dd0675c2.diff
LOG: [X86] combineTargetShuffle - commute VPERMV3 shuffles so any load is on the RHS
This helps ensure we lower to VPERMI2/VPERMT2 instructions whose index argument can then be commuted, converting them to VPERMT2/VPERMI2 respectively.
Similar to 1e31a4529244ead9f12abed524f33a48515abee2, but this handles cases where the one-use load only appears after further folding. The lowerShuffleWithPERMV version is kept, as it can also handle the non-VLX widening case.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/test/CodeGen/X86/vector-shuffle-v1.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ff2b9199384db4..9048d1d83f1874 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42252,6 +42252,17 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
DAG.getIntPtrConstant(0, DL));
}
}
+ SmallVector<SDValue, 2> Ops;
+ SmallVector<int, 32> Mask;
+ if (isShuffleFoldableLoad(N.getOperand(0)) &&
+ !isShuffleFoldableLoad(N.getOperand(2)) &&
+ getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ SDValue NewMask = getConstVector(
+ Mask, N.getOperand(1).getSimpleValueType(), DAG, DL, /*IsMask=*/true);
+ return DAG.getNode(X86ISD::VPERMV3, DL, VT, N.getOperand(2), NewMask,
+ N.getOperand(0));
+ }
return SDValue();
}
default:
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index d07923faa4309f..5078130f180779 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -3514,12 +3514,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x
; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
; CHECK-FAST-PERLANE: # %bb.0:
; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
-; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
-; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1]
-; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = [1,13,10,11,10,0,0,9]
+; CHECK-FAST-PERLANE-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm0 {%k1}
; CHECK-FAST-PERLANE-NEXT: retq
%vec = load <16 x float>, ptr %vp
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
@@ -3542,11 +3541,10 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8
; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
; CHECK-FAST-PERLANE: # %bb.0:
; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
-; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
-; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
-; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm4, %ymm0, %k1
-; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9]
+; CHECK-FAST-PERLANE-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm3, %ymm0, %k1
+; CHECK-FAST-PERLANE-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm0
; CHECK-FAST-PERLANE-NEXT: retq
%vec = load <16 x float>, ptr %vp
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 6f9b3e94aa68f6..2b89590a0bb419 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -719,10 +719,9 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,14,9,8,11,15,15,9]
+; AVX512F-NEXT: vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
; AVX512F-NEXT: vzeroupper
More information about the llvm-commits mailing list