[llvm] r263906 - [X86][SSE] Detect zeroable shuffle elements from different value types
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 20 08:45:42 PDT 2016
Author: rksimon
Date: Sun Mar 20 10:45:42 2016
New Revision: 263906
URL: http://llvm.org/viewvc/llvm-project?rev=263906&view=rev
Log:
[X86][SSE] Detect zeroable shuffle elements from different value types
Improve computeZeroableShuffleElements to peek through bitcasts and extract zero/undef values from BUILD_VECTOR nodes whose element sizes differ from the shuffle mask's.
Differential Revision: http://reviews.llvm.org/D14261
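For readers following the log message, here is a minimal standalone C++ sketch of the two cases the patch handles. This is not LLVM code: the names (zeroableLanes, SrcElts, DstEltBits) are ours, undef operands are left out for brevity, elements are assumed to fit in 64 bits, and x86's little-endian lane order is assumed. When the constant source vector has wider elements than the shuffle, each shuffle lane is a bit slice of one source element; when it has narrower elements, a shuffle lane is zeroable only if every source element it covers is zero.

// Hedged sketch only: mirrors the idea of computeZeroableShuffleElements
// peeking through a bitcast, using plain integers instead of SDNodes/APInt.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Returns one flag per shuffle-size lane of a bitcast constant vector:
// true if that lane's bits are all zero.
std::vector<bool> zeroableLanes(const std::vector<uint64_t> &SrcElts,
                                unsigned SrcEltBits, unsigned DstEltBits) {
  unsigned TotalBits = SrcEltBits * SrcElts.size();
  assert(TotalBits % DstEltBits == 0 && "element sizes must divide evenly");
  unsigned NumDstElts = TotalBits / DstEltBits;

  std::vector<bool> Zeroable(NumDstElts, false);
  for (unsigned i = 0; i != NumDstElts; ++i) {
    if (DstEltBits <= SrcEltBits) {
      // Wider source elements: lane i is a DstEltBits-wide slice of one
      // source element (little-endian lane order, as on x86).
      unsigned Scale = SrcEltBits / DstEltBits;
      uint64_t Slice = SrcElts[i / Scale] >> ((i % Scale) * DstEltBits);
      if (DstEltBits < 64)
        Slice &= (uint64_t(1) << DstEltBits) - 1;
      Zeroable[i] = (Slice == 0);
    } else {
      // Narrower source elements: lane i is zero only if every source
      // element it covers is zero.
      unsigned Scale = DstEltBits / SrcEltBits;
      bool AllZero = true;
      for (unsigned j = 0; j != Scale; ++j)
        AllZero &= (SrcElts[i * Scale + j] == 0);
      Zeroable[i] = AllZero;
    }
  }
  return Zeroable;
}

int main() {
  // <8 x i16> <0,0,1,1,2,2,3,3> viewed as four 32-bit lanes: only lane 0
  // (covering the two zero i16 elements) is zeroable, matching the
  // insertps_zero_from_v8i16 test below. Prints: 1 0 0 0
  for (bool Z : zeroableLanes({0, 0, 1, 1, 2, 2, 3, 3}, 16, 32))
    std::cout << Z << ' ';
  std::cout << '\n';
}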
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/insertps-combine.ll
llvm/trunk/test/CodeGen/X86/widen_load-2.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=263906&r1=263905&r2=263906&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Mar 20 10:45:42 2016
@@ -7257,6 +7257,10 @@ static SmallBitVector computeZeroableShu
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+ int VectorSizeInBits = V1.getValueType().getSizeInBits();
+ int ScalarSizeInBits = VectorSizeInBits / Mask.size();
+ assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
// Handle the easy cases.
@@ -7265,17 +7269,47 @@ static SmallBitVector computeZeroableShu
continue;
}
- // If this is an index into a build_vector node (which has the same number
- // of elements), dig out the input value and use it.
+ // Determine shuffle input and normalize the mask.
SDValue V = M < Size ? V1 : V2;
- if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
+ M %= Size;
+
+ // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
continue;
- SDValue Input = V.getOperand(M % Size);
- // The UNDEF opcode check really should be dead code here, but not quite
- // worth asserting on (it isn't invalid, just unexpected).
- if (Input.isUndef() || X86::isZeroNode(Input))
- Zeroable[i] = true;
+ // If the BUILD_VECTOR has fewer elements than the mask, the bitcasted
+ // portion of the (larger) source element must be UNDEF/ZERO.
+ if ((Size % V.getNumOperands()) == 0) {
+ int Scale = Size / V->getNumOperands();
+ SDValue Op = V.getOperand(M / Scale);
+ if (Op.isUndef() || X86::isZeroNode(Op))
+ Zeroable[i] = true;
+ else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+ APInt Val = Cst->getAPIntValue();
+ Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+ Val = Val.getLoBits(ScalarSizeInBits);
+ Zeroable[i] = (Val == 0);
+ } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
+ APInt Val = Cst->getValueAPF().bitcastToAPInt();
+ Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+ Val = Val.getLoBits(ScalarSizeInBits);
+ Zeroable[i] = (Val == 0);
+ }
+ continue;
+ }
+
+ // If the BUILD_VECTOR has more elements than the mask, all the (smaller)
+ // source elements covered by the mask element must be UNDEF or ZERO.
+ if ((V.getNumOperands() % Size) == 0) {
+ int Scale = V->getNumOperands() / Size;
+ bool AllZeroable = true;
+ for (int j = 0; j < Scale; ++j) {
+ SDValue Op = V.getOperand((M * Scale) + j);
+ AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
+ }
+ Zeroable[i] = AllZeroable;
+ continue;
+ }
}
return Zeroable;
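As a worked illustration of the ConstantSDNode path in the "fewer elements" branch above (a hedged sketch, not LLVM code: plain uint64_t shifts stand in for APInt::lshr and APInt::getLoBits, and the Ops/Size/Scale names are ours): for the <2 x i64> constant <1, -2> shuffled as four 32-bit lanes, Scale is 2, so mask element M reads operand M / Scale and bit slice M % Scale of it. Only the upper half of the i64 1 is zero, which is the lane the updated insertps tests below rely on.

// Hedged sketch (our names, not LLVM's): plain uint64_t shifts standing in
// for the APInt::lshr / APInt::getLoBits extraction in the branch above.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Ops[2] = {1, uint64_t(-2)};   // the <2 x i64> <1, -2> constant
  const int Size = 4, ScalarSizeInBits = 32;   // shuffled as four 32-bit lanes
  const int Scale = Size / 2;                  // Size / number of operands

  for (int M = 0; M != Size; ++M) {
    uint64_t Val = Ops[M / Scale] >> ((M % Scale) * ScalarSizeInBits);
    Val &= 0xFFFFFFFFu;                        // keep the low ScalarSizeInBits
    std::printf("lane %d: %s\n", M, Val == 0 ? "zeroable" : "non-zero");
  }
  // Only lane 1 (the upper 32 bits of the i64 1) prints as zeroable.
}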
Modified: llvm/trunk/test/CodeGen/X86/insertps-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/insertps-combine.ll?rev=263906&r1=263905&r2=263906&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/insertps-combine.ll (original)
+++ llvm/trunk/test/CodeGen/X86/insertps-combine.ll Sun Mar 20 10:45:42 2016
@@ -135,22 +135,18 @@ define <4 x float> @insertps_undef_input
define <4 x float> @insertps_zero_from_v2f64(<4 x float> %a0, <2 x double>* %a1) nounwind {
; SSE-LABEL: insertps_zero_from_v2f64:
; SSE: # BB#0:
-; SSE-NEXT: movapd {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00]
-; SSE-NEXT: movapd (%rdi), %xmm2
-; SSE-NEXT: addpd %xmm1, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE-NEXT: movapd %xmm2, (%rdi)
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: movapd (%rdi), %xmm1
+; SSE-NEXT: addpd {{.*}}(%rip), %xmm1
+; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; SSE-NEXT: movapd %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: insertps_zero_from_v2f64:
; AVX: # BB#0:
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00]
-; AVX-NEXT: vaddpd (%rdi), %xmm1, %xmm2
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,3]
-; AVX-NEXT: vmovapd %xmm2, (%rdi)
+; AVX-NEXT: vmovapd (%rdi), %xmm1
+; AVX-NEXT: vaddpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; AVX-NEXT: vmovapd %xmm1, (%rdi)
; AVX-NEXT: retq
%1 = load <2 x double>, <2 x double>* %a1
%2 = bitcast <2 x double> <double 1.0, double 2.0> to <4 x float>
@@ -163,27 +159,23 @@ define <4 x float> @insertps_zero_from_v
define <4 x float> @insertps_zero_from_v2i64(<4 x float> %a0, <2 x i64>* %a1) nounwind {
; SSE-LABEL: insertps_zero_from_v2i64:
; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,18446744073709551614]
-; SSE-NEXT: movdqa (%rdi), %xmm2
-; SSE-NEXT: paddq %xmm1, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE-NEXT: movdqa %xmm2, (%rdi)
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
+; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; SSE-NEXT: movdqa %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: insertps_zero_from_v2i64:
; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,18446744073709551614]
-; AVX-NEXT: vpaddq (%rdi), %xmm1, %xmm2
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,3]
-; AVX-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; AVX-NEXT: vmovdqa %xmm1, (%rdi)
; AVX-NEXT: retq
%1 = load <2 x i64>, <2 x i64>* %a1
%2 = bitcast <2 x i64> <i64 1, i64 -2> to <4 x float>
%3 = add <2 x i64> %1, <i64 1, i64 -2>
- %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 6, i32 2, i32 2, i32 3>
+ %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 5, i32 2, i32 2, i32 3>
store <2 x i64> %3, <2 x i64> *%a1
ret <4 x float> %4
}
@@ -191,21 +183,18 @@ define <4 x float> @insertps_zero_from_v
define <4 x float> @insertps_zero_from_v8i16(<4 x float> %a0, <8 x i16>* %a1) nounwind {
; SSE-LABEL: insertps_zero_from_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,1,1,2,2,3,3]
-; SSE-NEXT: movdqa (%rdi), %xmm2
-; SSE-NEXT: paddw %xmm1, %xmm2
-; SSE-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: movdqa %xmm2, (%rdi)
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm1
+; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; SSE-NEXT: movdqa %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: insertps_zero_from_v8i16:
; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,1,1,2,2,3,3]
-; AVX-NEXT: vpaddw (%rdi), %xmm1, %xmm2
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vpaddw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; AVX-NEXT: vmovdqa %xmm1, (%rdi)
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a1
%2 = bitcast <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 2, i16 2, i16 3, i16 3> to <4 x float>
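In the insertps_zero_from_v2i64 test above, the shufflevector mask changes from <6,2,2,3> to <5,2,2,3>: index 6 names element 2 of the bitcast constant (the low 32 bits of -2, which are not zero), while index 5 names element 1 (the upper 32 bits of the i64 1, which are zero), so the updated mask exercises a lane the new analysis can prove zeroable. A small hedged sketch of the operand/element normalization used by the lowering code (our names, not LLVM's):

// Hedged sketch of the shuffle-index normalization
// "SDValue V = M < Size ? V1 : V2; M %= Size;" for the updated mask above.
#include <cstdio>

int main() {
  const int Size = 4;
  const int Mask[4] = {5, 2, 2, 3};            // <4 x i32> <5, 2, 2, 3>
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    bool FromV2 = M >= Size;                   // Size..2*Size-1 name input 2
    std::printf("lane %d <- %s[%d]\n", i, FromV2 ? "V2" : "V1", M % Size);
  }
  // Prints: lane 0 <- V2[1], lane 1 <- V1[2], lane 2 <- V1[2], lane 3 <- V1[3]
  // V2[1] is the zeroable upper half of the i64 1 in the bitcast constant.
}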
Modified: llvm/trunk/test/CodeGen/X86/widen_load-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_load-2.ll?rev=263906&r1=263905&r2=263906&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_load-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_load-2.ll Sun Mar 20 10:45:42 2016
@@ -210,26 +210,26 @@ define void @add31i8(%i8vec31* nocapture
define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
; CHECK-LABEL: rot:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <158,158,158,u>
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: pshufb %xmm1, %xmm0
-; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <158,158,158,u>
+; CHECK-NEXT: pshufb %xmm0, %xmm1
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; CHECK-NEXT: movd %xmm1, %eax
; CHECK-NEXT: movw %ax, (%rsi)
; CHECK-NEXT: movb $-98, 2(%rsi)
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <1,1,1,u>
-; CHECK-NEXT: pshufb %xmm1, %xmm0
-; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,u>
+; CHECK-NEXT: pshufb %xmm0, %xmm1
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: movw %ax, (%rdx)
; CHECK-NEXT: movb $1, 2(%rdx)
; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: psrld $1, %xmm2
-; CHECK-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-NEXT: pextrb $8, %xmm2, 2(%rdi)
-; CHECK-NEXT: pshufb %xmm1, %xmm2
-; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $1, %xmm1
+; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-NEXT: pextrb $8, %xmm1, 2(%rdi)
+; CHECK-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: movw %ax, (%rdi)
; CHECK-NEXT: movq %rdi, %rax
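In the rot() test above, the constant-folded PSHUFB control mask changes from <0,4,8,12,u,...> to <0,4,8,128,u,...>, which appears to reflect the same zeroable-lane information: PSHUFB zeroes any destination byte whose control byte has its most significant bit set, so 128 forces byte 3 to zero instead of reading the undef top lane of <158,158,158,u>. A hedged standalone emulation of that control-byte rule (our helper, not the actual instruction or intrinsic):

// Hedged sketch of 128-bit PSHUFB semantics: a control byte with its top bit
// set zeroes the destination byte, otherwise its low four bits select a
// source byte.
#include <array>
#include <cstdint>
#include <cstdio>

std::array<uint8_t, 16> pshufb(const std::array<uint8_t, 16> &Src,
                               const std::array<uint8_t, 16> &Ctl) {
  std::array<uint8_t, 16> Dst{};
  for (int i = 0; i != 16; ++i)
    Dst[i] = (Ctl[i] & 0x80) ? 0 : Src[Ctl[i] & 0x0F];
  return Dst;
}

int main() {
  std::array<uint8_t, 16> Src{};
  for (int i = 0; i != 16; ++i)
    Src[i] = uint8_t(i + 1);                   // stand-in source bytes 1..16
  std::array<uint8_t, 16> Ctl{0, 4, 8, 128};   // remaining control bytes are 0
  auto Dst = pshufb(Src, Ctl);
  std::printf("%d %d %d %d\n", Dst[0], Dst[1], Dst[2], Dst[3]);
  // Prints "1 5 9 0": control byte 128 forces destination byte 3 to zero.
}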