[llvm] r234193 - [X86][SSE] Use (V)PINSRB for direct byte insertion in 16i8 buildvector on SSE4.1 targets
Simon Pilgrim
llvm-dev at redking.me.uk
Mon Apr 6 11:39:00 PDT 2015
Author: rksimon
Date: Mon Apr 6 13:39:00 2015
New Revision: 234193
URL: http://llvm.org/viewvc/llvm-project?rev=234193&view=rev
Log:
[X86][SSE] Use (V)PINSRB for direct byte insertion in 16i8 buildvector on SSE4.1 targets
This patch allows SSE4.1 targets to use (V)PINSRB to create 16i8 vectors by inserting i8 scalars directly into an XMM register, instead of merging pairs of i8 scalars into an i16 and inserting them with the SSE2 PINSRW instruction.
This allows folding of byte loads and reduces scalar register usage as well.
Differential Revision: http://reviews.llvm.org/D8839
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vec_cast2.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=234193&r1=234192&r2=234193&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Apr 6 13:39:00 2015
@@ -4460,6 +4460,29 @@ static SDValue LowerBuildVectorv16i8(SDV
SDLoc dl(Op);
SDValue V;
bool First = true;
+
+ // SSE4.1 - use PINSRB to insert each byte directly.
+ if (Subtarget->hasSSE41()) {
+ for (unsigned i = 0; i < 16; ++i) {
+ bool isNonZero = (NonZeros & (1 << i)) != 0;
+ if (isNonZero) {
+ if (First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v16i8);
+ First = false;
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ MVT::v16i8, V, Op.getOperand(i),
+ DAG.getIntPtrConstant(i));
+ }
+ }
+
+ return V;
+ }
+
+ // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
for (unsigned i = 0; i < 16; ++i) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
if (ThisIsNonZero && First) {
Modified: llvm/trunk/test/CodeGen/X86/vec_cast2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_cast2.ll?rev=234193&r1=234192&r2=234193&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_cast2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_cast2.ll Mon Apr 6 13:39:00 2015
@@ -100,37 +100,29 @@ define <8 x i8> @foo3_8(<8 x float> %src
;
; CHECK-WIDE-LABEL: foo3_8:
; CHECK-WIDE: ## BB#0:
-; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT: shll $8, %eax
-; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx
-; CHECK-WIDE-NEXT: movzbl %cl, %ecx
-; CHECK-WIDE-NEXT: orl %eax, %ecx
-; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT: shll $8, %eax
-; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx
-; CHECK-WIDE-NEXT: movzbl %dl, %edx
-; CHECK-WIDE-NEXT: orl %eax, %edx
-; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
-; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
+; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm2 ## xmm2 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT: shll $8, %eax
-; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT: movzbl %cl, %ecx
-; CHECK-WIDE-NEXT: orl %eax, %ecx
-; CHECK-WIDE-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
-; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT: shll $8, %eax
-; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT: movzbl %cl, %ecx
-; CHECK-WIDE-NEXT: orl %eax, %ecx
-; CHECK-WIDE-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; CHECK-WIDE-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0
; CHECK-WIDE-NEXT: vzeroupper
; CHECK-WIDE-NEXT: retl
%res = fptosi <8 x float> %src to <8 x i8>
@@ -145,21 +137,17 @@ define <4 x i8> @foo3_4(<4 x float> %src
;
; CHECK-WIDE-LABEL: foo3_4:
; CHECK-WIDE: ## BB#0:
-; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT: shll $8, %eax
-; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx
-; CHECK-WIDE-NEXT: movzbl %cl, %ecx
-; CHECK-WIDE-NEXT: orl %eax, %ecx
-; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT: shll $8, %eax
-; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx
-; CHECK-WIDE-NEXT: movzbl %dl, %edx
-; CHECK-WIDE-NEXT: orl %eax, %edx
-; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
+; CHECK-WIDE-NEXT: vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT: vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0
; CHECK-WIDE-NEXT: retl
%res = fptosi <4 x float> %src to <4 x i8>
ret <4 x i8> %res
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll?rev=234193&r1=234192&r2=234193&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll Mon Apr 6 13:39:00 2015
@@ -651,18 +651,30 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz
}
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE: # BB#0:
-; SSE-NEXT: shll $8, %edi
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pinsrw $2, %edi, %xmm0
-; SSE-NEXT: retq
-
+; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: shll $8, %edi
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pinsrw $2, %edi, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shll $8, %edi
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: pinsrw $2, %edi, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pinsrb $5, %edi, %xmm0
+; SSE41-NEXT: retq
+;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
-; AVX-NEXT: shll $8, %edi
; AVX-NEXT: vpxor %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $2, %edi, %xmm0
+; AVX-NEXT: vpinsrb $5, %edi, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 0
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -670,18 +682,30 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz
}
define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
-; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSE: # BB#0:
-; SSE-NEXT: shll $8, %edi
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pinsrw $7, %edi, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE2: # BB#0:
+; SSE2-NEXT: shll $8, %edi
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pinsrw $7, %edi, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shll $8, %edi
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: pinsrw $7, %edi, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pinsrb $15, %edi, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; AVX: # BB#0:
-; AVX-NEXT: shll $8, %edi
; AVX-NEXT: vpxor %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $7, %edi, %xmm0
+; AVX-NEXT: vpinsrb $15, %edi, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 0
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
@@ -689,18 +713,30 @@ define <16 x i8> @shuffle_v16i8_zz_uu_uu
}
define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE: # BB#0:
-; SSE-NEXT: movzbl %dil, %eax
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pinsrw $1, %eax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2: # BB#0:
+; SSE2-NEXT: movzbl %dil, %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pinsrw $1, %eax, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movzbl %dil, %eax
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: pinsrw $1, %eax, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pinsrb $2, %edi, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
-; AVX-NEXT: movzbl %dil, %eax
; AVX-NEXT: vpxor %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $1, %eax, %xmm0
+; AVX-NEXT: vpinsrb $2, %edi, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 3
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
More information about the llvm-commits
mailing list