[llvm] [X86] combineConcatVectorOps - add concatenation handling for BITCAST + AssertSext/AssertZext nodes (PR #133913)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 9 11:18:28 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes:
These nodes are effectively free, so we should only concatenate them if their inner operands will concatenate together.
This also exposed a regression: canonicalizeShuffleWithOp failed to recognize that it could merge shuffles with a CONCAT_VECTORS node.
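As a rough illustration of the rule the patch applies, here is a minimal, self-contained toy sketch (not LLVM code; every type, name and helper below is hypothetical): AssertSext/AssertZext wrappers are treated as free, so concat(AssertZext(a,T), AssertZext(b,T)) is only rewritten to AssertZext(concat(a,b),T) when the inner concat itself simplifies, otherwise the concat is left alone.

```cpp
// Toy model only: mirrors the "fold only if the inner sources fold" idea,
// not LLVM's SelectionDAG API.
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Val {
  std::string Op;            // e.g. "AssertZext", "extract_lo", "extract_hi"
  std::vector<Val> Operands; // operands of this toy node
  std::string Ty;            // assertion type for AssertZext
};

// Stand-in for recursing into combineConcatVectorOps on the unwrapped
// sources: here it only "succeeds" when the sources are the low/high
// halves of the same wide value, so concat(lo(X), hi(X)) == X.
std::optional<Val> tryConcatSources(const Val &A, const Val &B) {
  if (A.Op == "extract_lo" && B.Op == "extract_hi" &&
      A.Operands[0].Op == B.Operands[0].Op)
    return A.Operands[0];
  return std::nullopt;
}

// concat(AssertZext(a,T), AssertZext(b,T)) -> AssertZext(concat(a,b), T),
// but only when the inner concat simplifies.
std::optional<Val> tryConcatAssert(const Val &A, const Val &B) {
  if (A.Op != "AssertZext" || B.Op != "AssertZext" || A.Ty != B.Ty)
    return std::nullopt;
  if (auto Src = tryConcatSources(A.Operands[0], B.Operands[0]))
    return Val{"AssertZext", {*Src}, A.Ty};
  return std::nullopt; // wrapper is free, so don't widen it for nothing
}

int main() {
  Val X{"X", {}, ""};
  Val Lo{"extract_lo", {X}, ""}, Hi{"extract_hi", {X}, ""};
  Val A{"AssertZext", {Lo}, "i8"}, B{"AssertZext", {Hi}, "i8"};
  if (auto Folded = tryConcatAssert(A, B))
    std::cout << Folded->Op << "(" << Folded->Operands[0].Op << ", "
              << Folded->Ty << ")\n"; // prints AssertZext(X, i8)
}
```

The BITCAST case in the patch follows the same principle: it peeks through the bitcasts and only folds when the underlying sources themselves concatenate.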
---
Patch is 595.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133913.diff
13 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+35)
- (modified) llvm/test/CodeGen/X86/shift-i512.ll (+18-22)
- (modified) llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll (+8-10)
- (modified) llvm/test/CodeGen/X86/vector-fshr-256.ll (+31-17)
- (modified) llvm/test/CodeGen/X86/vector-fshr-rot-256.ll (+13-12)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll (+16-14)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+32-32)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll (+44-44)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+210-206)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll (+300-296)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+2600-2744)
- (modified) llvm/test/CodeGen/X86/vector-reduce-ctpop.ll (+37-42)
- (modified) llvm/test/CodeGen/X86/widen_fdiv.ll (+15-29)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a3c423270f44a..1a771ecd651ea 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41704,6 +41704,7 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
(Op.getOpcode() == Opc && Op->hasOneUse()) ||
(Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
+ (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
(FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
DAG.isSplatValue(Op, /*AllowUndefs*/ false);
};
@@ -58134,6 +58135,40 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
unsigned Opcode = Op0.getOpcode();
switch (Opcode) {
+ case ISD::AssertSext:
+ case ISD::AssertZext: {
+ if (!IsSplat && llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ }))
+ if (SDValue ConcatSrc =
+ combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget, Depth + 1))
+ return DAG.getNode(Opcode, DL, VT, ConcatSrc, Op0.getOperand(1));
+ break;
+ }
+ case ISD::BITCAST: {
+ // TODO: Support AVX1/AVX2 bitcasts.
+ SmallVector<SDValue, 4> SubOps;
+ for (SDValue SubOp : Ops)
+ SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
+ EVT InnerVT = SubOps[0].getValueType();
+ unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
+ if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
+ (Subtarget.hasBWI() ||
+ (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
+ ((VT.is256BitVector() && Subtarget.hasVLX()) ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
+ llvm::all_of(SubOps, [InnerVT](SDValue Op) {
+ return Op.getValueType() == InnerVT;
+ })) {
+ MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
+ MVT ConcatVT = MVT::getVectorVT(
+ ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
+ if (SDValue ConcatSrc = combineConcatVectorOps(
+ DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
+ return DAG.getBitcast(VT, ConcatSrc);
+ }
+ break;
+ }
case ISD::VECTOR_SHUFFLE: {
// TODO: Generalize NumOps support.
if (!IsSplat && NumOps == 2 &&
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index f7dd1dc0949f5..c7da04171e6a1 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -123,20 +123,18 @@ define <8 x i64> @lshr_i512_1(<8 x i64> %a) {
;
; AVX512VBMI-LABEL: lshr_i512_1:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $63, %xmm5, %xmm3, %xmm3
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $63, %xmm2, %xmm1, %xmm2
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512VBMI-NEXT: vpsrlq $1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512VBMI-NEXT: vpsrlq $1, %xmm2, %xmm2
+; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
+; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0
; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
@@ -238,20 +236,18 @@ define <8 x i64> @ashr_i512_1(<8 x i64> %a) {
;
; AVX512VBMI-LABEL: ashr_i512_1:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $63, %xmm5, %xmm3, %xmm3
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $63, %xmm2, %xmm1, %xmm2
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512VBMI-NEXT: vpsraq $1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512VBMI-NEXT: vpsraq $1, %xmm2, %xmm2
+; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
+; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0
; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index ec442c185706c..e27a77ed2293d 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -297,23 +297,21 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u,33,37,41,45,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,49,53,57,62,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
+; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,12,0,5]
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
-; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BWVL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u,33,37,41,45,u,u,u,u,u,u,u,u,u,u,u,u,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 35c707eac83b4..0fa2c858ff000 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1228,13 +1228,14 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94]
-; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsrlw %xmm2, %ymm4, %ymm4
-; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512VBMI2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpermt2b %zmm4, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95]
+; AVX512VBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VBMI2-NEXT: vpsrlw %xmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
+; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
@@ -1251,16 +1252,29 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VLBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
-; AVX10-LABEL: splatvar_funnnel_v32i8:
-; AVX10: # %bb.0:
-; AVX10-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX10-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX10-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
-; AVX10-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX10-NEXT: vpsrlw %xmm2, %ymm0, %ymm1
-; AVX10-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
-; AVX10-NEXT: vpermi2b %ymm3, %ymm1, %ymm0
-; AVX10-NEXT: retq
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95,0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87]
+; AVX512VLVBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [32,34,36,38,40,42,44,46,0,2,4,6,8,10,12,14,48,50,52,54,56,58,60,62,16,18,20,22,24,26,28,30]
+; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VLVBMI2-NEXT: retq
+;
+; AVX10_256-LABEL: splatvar_funnnel_v32i8:
+; AVX10_256: # %bb.0:
+; AVX10_256-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX10_256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX10_256-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
+; AVX10_256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX10_256-NEXT: vpsrlw %xmm2, %ymm0, %ymm1
+; AVX10_256-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
+; AVX10_256-NEXT: vpermi2b %ymm3, %ymm1, %ymm0
+; AVX10_256-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
; XOPAVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 5cac190eae690..3d4f283260aa5 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -992,25 +992,26 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94]
-; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm3, %zmm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
-; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpermt2b %zmm3, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [32,34,36,38,40,42,44,46,0,2,4,6,8,10,12,14,48,50,52,54,56,58,60,62,16,18,20,22,24,26,28,30]
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm3, %zmm0
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm1
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
-; AVX512VLVBMI2-NEXT: vpermi2b %ymm2, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
index fc4377a08d560..3e04c2c8120cc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
@@ -130,13 +130,14 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512BW-FCP-LABEL: store_i16_stride4_vf2:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
-; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2
-; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,24,0,16,9,25,1,17]
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
+; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride4_vf2:
@@ -152,13 +153,14 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride4_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,24,0,16,9,25,1,17]
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64
%in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 7b619344e83f6..3767f1d68dfb7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7....
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/133913