[llvm] [X86] SimplifyDemandedVectorEltsForTargetNode - handle 512-bit X86ISD::VPERMI with lower half demanded elts (PR #137139)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 24 01:35:05 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/137139
512-bit X86ISD::VPERMI nodes handle the lower/upper 256 bits separately, so if we don't demand the upper-half elements we can just use the 256-bit variant.
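To illustrate, here is one of the affected cases from llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll; the before/after codegen is taken from the updated CHECK lines in the patch (the trailing ret is implied by the test's <2 x i64> return type):

; Only the low 2 elements of the 8-element permute are demanded, so the
; 512-bit permute can be narrowed to its 256-bit form:
;   before: vpermpd zmm0 = zmm0[3,0,2,3,7,4,6,7]
;   after:  vpermpd ymm0 = ymm0[3,0,2,3]
define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %res
}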
From 4c07cd67de1f334507845d9e3e0b2b9f1bbd989f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 24 Apr 2025 09:30:43 +0100
Subject: [PATCH] [X86] SimplifyDemandedVectorEltsForTargetNode - handle
512-bit X86ISD::VPERMI with lower half demanded elts
512-bit X86ISD::VPERMI nodes handle the lower/upper 256 bits separately, so if we don't demand the upper-half elements we can just use the 256-bit variant.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 12 +++++++-
.../X86/avx512-shuffles/partial_permute.ll | 12 ++++----
.../vector-interleaved-load-i64-stride-3.ll | 28 ++++++++-----------
3 files changed, 29 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dfaf58e753fb7..4a9121baba7db 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43786,7 +43786,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return TLO.CombineTo(Op, Insert);
}
case X86ISD::VPERMI: {
- // Simplify PERMPD/PERMQ to extract_subvector.
+ // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
// TODO: This should be done in shuffle combining.
if (VT == MVT::v4f64 || VT == MVT::v4i64) {
SmallVector<int, 4> Mask;
@@ -43799,6 +43799,16 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return TLO.CombineTo(Op, Insert);
}
}
+ // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
+ if (VT == MVT::v8f64 || VT == MVT::v8i64) {
+ SDLoc DL(Op);
+ SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
+ SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
+ Op.getOperand(1));
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
+ return TLO.CombineTo(Op, Insert);
+ }
break;
}
case X86ISD::VPERMV: {
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index a84424bf7dea9..b3bf464b529d0 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -2283,8 +2283,8 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i
define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -2293,9 +2293,9 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,2,3,7,4,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[3,0,2,3]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -2308,8 +2308,8 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,2,3,7,4,6,7]
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
index 93a84e30412d6..c8b95cd71c5d1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
@@ -84,7 +84,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-LABEL: load_i64_stride3_vf2:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,3,2,3]
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
@@ -97,9 +97,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-LABEL: load_i64_stride3_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
+; AVX512-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3]
; AVX512-FCP-NEXT: vmovaps 16(%rdi), %xmm2
; AVX512-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512-FCP-NEXT: vmovaps %xmm1, (%rsi)
@@ -110,7 +109,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-LABEL: load_i64_stride3_vf2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
+; AVX512DQ-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,3,2,3]
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
@@ -123,9 +122,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-LABEL: load_i64_stride3_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
+; AVX512DQ-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3]
; AVX512DQ-FCP-NEXT: vmovaps 16(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%rsi)
@@ -136,7 +134,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-LABEL: load_i64_stride3_vf2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
+; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,3,2,3]
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
@@ -149,9 +147,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-LABEL: load_i64_stride3_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
+; AVX512BW-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3]
; AVX512BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2
; AVX512BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%rsi)
@@ -162,7 +159,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-BW-LABEL: load_i64_stride3_vf2:
; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
+; AVX512DQ-BW-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,3,2,3]
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
@@ -175,9 +172,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-LABEL: load_i64_stride3_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2
; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%rsi)