[llvm] [X86] getFauxShuffleMask - relax one use limit for insert_subvector concat splat pattern (PR #127981)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 20 02:03:57 PST 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/127981
If we're splatting a subvector using an insert_subvector(insert_subvector(undef,sub,0),sub,c) pattern then permit multiple uses of the subvector as long as the insert_subvector nodes are the only users.
From 070c586f179182860dba1044148c3b9c1d1b88db Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 20 Feb 2025 10:03:01 +0000
Subject: [PATCH] [X86] getFauxShuffleMask - relax one use limit for
insert_subvector concat splat pattern
If we're splatting a subvector using an insert_subvector(insert_subvector(undef,sub,0),sub,c) pattern then permit multiple uses of the subvector as long as the insert_subvector nodes are the only users.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 33 ++++++++++---------
.../zero_extend_vector_inreg_of_broadcast.ll | 14 ++++----
2 files changed, 25 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 429e2b42ab5ca..b69674d9be4e4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6128,10 +6128,26 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
+ uint64_t InsertIdx = N.getConstantOperandVal(2);
+ // Handle CONCAT(SUB0, SUB1).
+ // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
+ if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
+ NumBitsPerElt == 64 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(0).isUndef() &&
+ Src.getOperand(1).getValueType() == SubVT &&
+ Src.getConstantOperandVal(2) == 0 &&
+ SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
+ for (int i = 0; i != (int)NumSubElts; ++i)
+ Mask.push_back(i);
+ for (int i = 0; i != (int)NumSubElts; ++i)
+ Mask.push_back(i + NumElts);
+ Ops.push_back(Src.getOperand(1));
+ Ops.push_back(Sub);
+ return true;
+ }
if (!N->isOnlyUserOf(Sub.getNode()))
return false;
SDValue SubBC = peekThroughBitcasts(Sub);
- uint64_t InsertIdx = N.getConstantOperandVal(2);
// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
@@ -6154,21 +6170,6 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
Ops.push_back(SubBCSrc);
return true;
}
- // Handle CONCAT(SUB0, SUB1).
- // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
- if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
- NumBitsPerElt == 64 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Src.getOperand(0).isUndef() &&
- Src.getOperand(1).getValueType() == SubVT &&
- Src.getConstantOperandVal(2) == 0) {
- for (int i = 0; i != (int)NumSubElts; ++i)
- Mask.push_back(i);
- for (int i = 0; i != (int)NumSubElts; ++i)
- Mask.push_back(i + NumElts);
- Ops.push_back(Src.getOperand(1));
- Ops.push_back(Sub);
- return true;
- }
// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 758061d456807..29d80e16bb26e 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -6897,9 +6897,10 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,18,19,4,21,22,23,0,25,26,27,4,29,30,31]
+; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -7098,9 +7099,10 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15]
+; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
More information about the llvm-commits
mailing list