[llvm] 1e7865d - [X86] SimplifyMultipleUseDemandedBitsForTargetNode - add initial X86ISD::VSRAI handling.

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sun May 24 08:15:55 PDT 2020


Author: Simon Pilgrim
Date: 2020-05-24T16:07:46+01:00
New Revision: 1e7865d94647119aab3c1b2e8ce259ca1634bc64

URL: https://github.com/llvm/llvm-project/commit/1e7865d94647119aab3c1b2e8ce259ca1634bc64
DIFF: https://github.com/llvm/llvm-project/commit/1e7865d94647119aab3c1b2e8ce259ca1634bc64.diff

LOG: [X86] SimplifyMultipleUseDemandedBitsForTargetNode - add initial X86ISD::VSRAI handling.

This initial version only peeks through cases where we just demand the sign bit of an ashr shift, but we could generalize this further depending on how many sign bits the source already has.
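
For illustration only (not part of the patch): the fold relies on an
arithmetic shift right replicating the sign bit, so the sign bit of the
shifted result always equals the sign bit of the source. Below is a
minimal standalone C++ sketch of that property (hypothetical code, not
from the LLVM tree), assuming >> on a signed integer is an arithmetic
shift - true on all mainstream compilers and guaranteed since C++20:

#include <cassert>
#include <cstdint>
#include <iostream>

// Mask out everything but the sign bit - the scalar analogue of the
// DemandedBits.isSignMask() check on a 32-bit element.
static uint32_t signBit(int32_t V) {
  return static_cast<uint32_t>(V) & 0x80000000u;
}

int main() {
  const int32_t Samples[] = {0, 1, -1, 42, -42, INT32_MIN, INT32_MAX};
  for (int32_t V : Samples)
    for (unsigned Sh = 0; Sh < 32; ++Sh)
      // An arithmetic shift replicates the sign bit, so demanding only
      // the sign bit of the shift result is the same as demanding it
      // of the source V.
      assert(signBit(V >> Sh) == signBit(V));
  std::cout << "sign bit of ashr(x, n) == sign bit of x for all samples\n";
}

Per the TODO in the patch, the same reasoning would extend to any
demanded mask that only covers the sign-extended top bits of the shift
result.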

The pr18014.ll case is a minor annoyance - we've failed to move the psrad/paddd after the blendvps, which would have avoided the extra move, but we have still increased the ILP.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/avx2-masked-gather.ll
    llvm/test/CodeGen/X86/movmsk-cmp.ll
    llvm/test/CodeGen/X86/pr18014.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ceeee364d07f..f56a11c0e8ee 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37404,6 +37404,12 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
       return Vec;
      break;
   }
+  case X86ISD::VSRAI:
+    // iff we only need the sign bit then we can use the source directly.
+    // TODO: generalize where we only demand extended signbits.
+    if (DemandedBits.isSignMask())
+      return Op.getOperand(0);
+    break;
   case X86ISD::PCMPGT:
     // icmp sgt(0, R) == ashr(R, BitWidth-1).
     // iff we only need the sign bit then we can use R directly.

diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
index 1d4e6a552214..62326113ed0d 100644
--- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll
+++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
@@ -347,7 +347,6 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT:    vpslld $31, %ymm0, %ymm0
-; X64-NEXT:    vpsrad $31, %ymm0, %ymm0
 ; X64-NEXT:    vmovdqa (%rdi), %ymm2
 ; X64-NEXT:    vmovdqa 32(%rdi), %ymm3
 ; X64-NEXT:    vextracti128 $1, %ymm1, %xmm4
@@ -462,7 +461,6 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT:    vpslld $31, %ymm0, %ymm0
-; X64-NEXT:    vpsrad $31, %ymm0, %ymm0
 ; X64-NEXT:    vmovaps (%rdi), %ymm2
 ; X64-NEXT:    vmovaps 32(%rdi), %ymm3
 ; X64-NEXT:    vextractf128 $1, %ymm1, %xmm4

diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 9bde9b454eaa..fa6d94991013 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -1602,7 +1602,6 @@ define i1 @allones_v16i16_and1(<16 x i16> %arg) {
 ; AVX2-LABEL: allones_v16i16_and1:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsllw $15, %ymm0, %ymm0
-; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovmskb %xmm0, %eax
@@ -1823,7 +1822,6 @@ define i1 @allzeros_v16i16_and1(<16 x i16> %arg) {
 ; AVX2-LABEL: allzeros_v16i16_and1:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsllw $15, %ymm0, %ymm0
-; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovmskb %xmm0, %eax
@@ -3005,7 +3003,6 @@ define i1 @allones_v16i16_and4(<16 x i16> %arg) {
 ; AVX2-LABEL: allones_v16i16_and4:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsllw $13, %ymm0, %ymm0
-; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovmskb %xmm0, %eax
@@ -3226,7 +3223,6 @@ define i1 @allzeros_v16i16_and4(<16 x i16> %arg) {
 ; AVX2-LABEL: allzeros_v16i16_and4:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsllw $13, %ymm0, %ymm0
-; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovmskb %xmm0, %eax

diff --git a/llvm/test/CodeGen/X86/pr18014.ll b/llvm/test/CodeGen/X86/pr18014.ll
index fed68e86dfbc..f0bc6f24c699 100644
--- a/llvm/test/CodeGen/X86/pr18014.ll
+++ b/llvm/test/CodeGen/X86/pr18014.ll
@@ -2,17 +2,18 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s
 
 ; Ensure PSRAD is generated as the condition is consumed by both PADD and
-; BLENDVPS. PAND requires all bits setting properly.
+; BLENDVPS. PADD requires all bits setting properly.
 
 define <4 x i32> @foo(<4 x i32>* %p, <4 x i1> %cond, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pslld $31, %xmm0
-; CHECK-NEXT:    psrad $31, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm3
+; CHECK-NEXT:    psrad $31, %xmm3
+; CHECK-NEXT:    paddd %xmm1, %xmm3
 ; CHECK-NEXT:    blendvps %xmm0, %xmm1, %xmm2
-; CHECK-NEXT:    paddd %xmm0, %xmm1
 ; CHECK-NEXT:    movaps %xmm2, (%rdi)
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm3, %xmm0
 ; CHECK-NEXT:    retq
   %sext_cond = sext <4 x i1> %cond to <4 x i32>
   %t1 = add <4 x i32> %v1, %sext_cond


        

