[llvm] 52bc812 - [X86] combineConcatVectorOps - concat(shuffle(x,y,m1),shuffle(x,y,m2)) -> shuffle(concat(x,x),concat(y,y),m3) on VBMI targets (#130134)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 7 01:35:50 PST 2025
Author: Simon Pilgrim
Date: 2025-03-07T09:35:47Z
New Revision: 52bc812a10d14c6c9d704d8a0d52b9a91a9f19bd
URL: https://github.com/llvm/llvm-project/commit/52bc812a10d14c6c9d704d8a0d52b9a91a9f19bd
DIFF: https://github.com/llvm/llvm-project/commit/52bc812a10d14c6c9d704d8a0d52b9a91a9f19bd.diff
LOG: [X86] combineConcatVectorOps - concat(shuffle(x,y,m1),shuffle(x,y,m2)) -> shuffle(concat(x,x),concat(y,y),m3) on VBMI targets (#130134)
With VBMI we are guaranteed to have efficient cross-lane 256-bit shuffles for any element width (VPERMB/VPERMI2B), so the concat(x,x) / concat(y,y) subvector splats should always be cheap.
Fixes #116931
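As a rough illustration, the pattern this fold targets looks like the reduced IR below (a hypothetical example, not one of the tests in this patch; the function name and masks are made up). Two shuffles of the same pair of v16i8 operands are concatenated into a v32i8 result; on an AVX512VL+VBMI target the 128-bit unpack + vinserti128 sequence can instead widen to a single cross-lane byte permute of the repeated operands:

define <32 x i8> @concat_interleave(<16 x i8> %x, <16 x i8> %y) {
  ; lo/hi halves: byte interleaves of the same x and y operands (masks m1 and m2)
  %lo = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %hi = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ; concat(lo, hi) - the concat(shuffle, shuffle) shape handled by combineConcatVectorOps
  %r = shufflevector <16 x i8> %lo, <16 x i8> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %r
}

Exactly which instructions llc emits still depends on the surrounding DAG combines, but with +avx512vl,+avx512vbmi the repeated-operand case is now allowed to become shuffle(concat(x,x),concat(y,y),m3) and lower to vpermi2b/vpermb, as seen in the updated splatvar_funnnel_v16i8 checks below.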
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 763a1e959ad27..57a4c6f7a4869 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57936,9 +57936,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
    switch (Op0.getOpcode()) {
    case ISD::VECTOR_SHUFFLE: {
-      if (NumOps == 2 && VT.is256BitVector() &&
+      // TODO: Relax VBMI requirement for repeated shuffle ops - currently
+      // limited to targets that should always have good cross lane shuffles.
+      if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
          (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
-          (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
+          (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1) ||
+           (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
+            Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
+            Subtarget.hasVBMI()))) {
        int NumSubElts = Op0.getValueType().getVectorNumElements();
        SmallVector<int> NewMask;
        for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index d830834a83ab4..a56b0a6351a3b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1566,11 +1566,12 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512VLVBMI2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
+; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3
+; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %ymm3, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 399bdb7a248b8..f45405d885377 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1218,9 +1218,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VLVBMI2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLVBMI2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d08bfc6e2d7ea..28d637a50109d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -5059,18 +5059,39 @@ define void @shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
-; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2OR512VL-NEXT: vmovdqa %xmm0, 16(%rdi)
-; AVX2OR512VL-NEXT: vmovdqa %xmm2, (%rdi)
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vmovdqa %xmm0, 16(%rdi)
+; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VLBW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512VLBW-NEXT: vmovdqa %xmm0, 16(%rdi)
+; AVX512VLBW-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX512VLBW-NEXT: vzeroupper
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: vmovdqa %ymm0, (%rdi)
+; AVX512VLVBMI-NEXT: vzeroupper
+; AVX512VLVBMI-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
; XOPAVX1: # %bb.0: