[llvm] ab4e04a - [X86][AVX] createVariablePermute - generalize the PR50356 fix for smaller indices vector as well
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed May 19 06:40:06 PDT 2021
Author: Simon Pilgrim
Date: 2021-05-19T14:39:41+01:00
New Revision: ab4e04a0f31ae187f67991980e707ff70bbc6285
URL: https://github.com/llvm/llvm-project/commit/ab4e04a0f31ae187f67991980e707ff70bbc6285
DIFF: https://github.com/llvm/llvm-project/commit/ab4e04a0f31ae187f67991980e707ff70bbc6285.diff
LOG: [X86][AVX] createVariablePermute - generalize the PR50356 fix for smaller indices vector as well
Generalize the fix from rGd0902a8665b1 by ensuring we widen/narrow the indices subvector first and only then perform the ZERO_EXTEND_VECTOR_INREG (if necessary), which should allow us to perform variable permutes with source/destination/indices vectors of any width.
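As a minimal sketch of the new ordering (a hypothetical standalone adjustIndices helper for illustration only - the patch itself builds SelectionDAG nodes via extractSubVector / widenSubVector / ISD::ZERO_EXTEND_VECTOR_INREG), the following C++ model traces the widths involved:

// Trace the index-adjustment steps for a variable permute with a
// DstBits-wide, DstElts-element destination and an IdxBits-wide,
// IdxElts-element indices vector.
#include <cstdio>

static void adjustIndices(unsigned DstBits, unsigned DstElts,
                          unsigned IdxBits, unsigned IdxElts) {
  unsigned IdxEltBits = IdxBits / IdxElts;
  if (IdxElts > DstElts) {
    // Step 1: narrow/widen the indices *vector* to the destination width.
    if (IdxBits > DstBits)
      printf("extractSubVector: %u -> %u bits\n", IdxBits, DstBits);
    else if (IdxBits < DstBits)
      printf("widenSubVector: %u -> %u bits\n", IdxBits, DstBits);
    IdxBits = DstBits;
    IdxElts = IdxBits / IdxEltBits;
    // Step 2: zero-extend the low DstElts index *elements* in-register.
    if (IdxElts > DstElts)
      printf("ZERO_EXTEND_VECTOR_INREG: v%ui%u -> v%ui%u\n", IdxElts,
             IdxEltBits, DstElts, DstBits / DstElts);
  }
  // (The real code then zero-extends/truncates each element to IndicesVT.)
}

int main() {
  // The new test below: v4i64 source/destination with v16i8 indices.
  adjustIndices(/*DstBits=*/256, /*DstElts=*/4, /*IdxBits=*/128,
                /*IdxElts=*/16);
  return 0;
}

For the v16i8 case this prints "widenSubVector: 128 -> 256 bits" and then "ZERO_EXTEND_VECTOR_INREG: v32i8 -> v4i64", matching the vpmovzxbq in the checks below. With the old ordering, an indices vector narrower than the destination fell through to the extract path, which would have tried to extract a 256-bit subvector from a 128-bit value.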
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/var-permute-256.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c1588b0b7b5d..4207d2365539 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9978,12 +9978,17 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
          "Illegal variable permute mask size");
   if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
-    if (IndicesVec.getValueSizeInBits() == SizeInBits)
-      IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
-                               IndicesVT, IndicesVec);
-    else
+    // Narrow/widen the indices vector to the correct size.
+    if (IndicesVec.getValueSizeInBits() > SizeInBits)
       IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
                                     NumElts * VT.getScalarSizeInBits());
+    else if (IndicesVec.getValueSizeInBits() < SizeInBits)
+      IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
+                                  SDLoc(IndicesVec), SizeInBits);
+    // Zero-extend the index elements within the vector.
+    if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
+      IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
+                               IndicesVT, IndicesVec);
   }
 
   IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 9902a351b8bf..3327704c04a0 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -1138,6 +1138,10 @@ entry:
   ret <4 x i32> %tmp12
 }
 
+;
+; PR50356 - correctly adjust the indices vector to match the source/destination size.
+;
+
 define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr nounwind {
 ; XOP-LABEL: PR50356:
 ; XOP: # %bb.0:
@@ -1255,3 +1259,76 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr
   %v37 = select <4 x i1> %v36, <4 x i64> <i64 17, i64 51, i64 85, i64 119>, <4 x i64> <i64 34, i64 68, i64 102, i64 136> ; 17 68 102 136
   ret <4 x i64> %v37
 }
+
+define <4 x i64> @var_shuffle_v4i64_with_v16i8_indices(<4 x i64> %v, <16 x i8> %indices) unnamed_addr nounwind {
+; XOP-LABEL: var_shuffle_v4i64_with_v16i8_indices:
+; XOP: # %bb.0:
+; XOP-NEXT: vpsrld $16, %xmm1, %xmm2
+; XOP-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; XOP-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; XOP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vpaddq %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm3, %ymm0, %ymm0
+; XOP-NEXT: retq
+;
+; AVX1-LABEL: var_shuffle_v4i64_with_v16i8_indices:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; AVX1-NEXT: vpermilpd %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq {{\.LCPI[0-9]+_[0-9]+}}+{{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v4i64_with_v16i8_indices:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: var_shuffle_v4i64_with_v16i8_indices:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v4i64_with_v16i8_indices:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+  %index0 = extractelement <16 x i8> %indices, i32 0
+  %index1 = extractelement <16 x i8> %indices, i32 1
+  %index2 = extractelement <16 x i8> %indices, i32 2
+  %index3 = extractelement <16 x i8> %indices, i32 3
+  %v0 = extractelement <4 x i64> %v, i8 %index0
+  %v1 = extractelement <4 x i64> %v, i8 %index1
+  %v2 = extractelement <4 x i64> %v, i8 %index2
+  %v3 = extractelement <4 x i64> %v, i8 %index3
+  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
+  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
+  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
+  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
+  ret <4 x i64> %ret3
+}