[llvm] e53196b - [X86] Add support for calling SimplifyDemandedBits on the input of PDEP with a constant mask.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 28 14:35:58 PDT 2020


Author: Craig Topper
Date: 2020-09-28T14:21:30-07:00
New Revision: e53196b1e862902c74d83f0ce6f3578b1326f23d

URL: https://github.com/llvm/llvm-project/commit/e53196b1e862902c74d83f0ce6f3578b1326f23d
DIFF: https://github.com/llvm/llvm-project/commit/e53196b1e862902c74d83f0ce6f3578b1326f23d.diff

LOG: [X86] Add support for calling SimplifyDemandedBits on the input of PDEP with a constant mask.

We can do several optimizations for PDEP using computeKnownBits and SimplifyDemandedBits:

- If the MSBs of the output aren't demanded, those MSBs of the mask input aren't demanded either. We need to keep the most significant demanded bit of the mask and any mask bits before it.
- The number of possible ones in the mask determines how many of the LSBs of the other operand are demanded. Any bits of the mask we don't demand by the previous rule should not be counted.
- The result will have zeros in any position where the mask is zero.
- Since non-mask input bits can only be output in the original position or a higher bit position, the result will have at least as many trailing zeroes as the non-mask input.

Differential Revision: https://reviews.llvm.org/D87883

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/bmi2-x86_64.ll
    llvm/test/CodeGen/X86/bmi2.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 700b96ee9779..7d02eefacb6f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34053,6 +34053,17 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     }
     break;
   }
+  case X86ISD::PDEP: {
+    KnownBits Known2;
+    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    // Zeros are retained from the mask operand. But not ones.
+    Known.One.clearAllBits();
+    // The result will have at least as many trailing zeros as the non-mask
+    // operand since bits can only map to the same or higher bit position.
+    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+    break;
+  }
   case X86ISD::VTRUNC:
   case X86ISD::VTRUNCS:
   case X86ISD::VTRUNCUS:
@@ -38373,6 +38384,34 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
 
     break;
   }
+  case X86ISD::PDEP: {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
+    APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
+
+    // If the demanded bits has leading zeroes, we don't demand those from the
+    // mask.
+    if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
+      return true;
+
+    // The number of possible 1s in the mask determines the number of LSBs of
+    // operand 0 used. Undemanded bits from the mask don't matter so filter
+    // them before counting.
+    KnownBits Known2;
+    uint64_t Count = (~Known.Zero & LoMask).countPopulation();
+    APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
+    if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
+      return true;
+
+    // Zeroes are retained from the mask, but not ones.
+    Known.One.clearAllBits();
+    // The result will have at least as many trailing zeros as the non-mask
+    // operand since bits can only map to the same or higher bit position.
+    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+    return false;
+  }
   }
 
   return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -49580,6 +49619,17 @@ static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
+                           TargetLowering::DAGCombinerInfo &DCI) {
+  unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+                               APInt::getAllOnesValue(NumBits), DCI))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -49750,6 +49800,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::FP_ROUND:       return combineFP_ROUND(N, DAG, Subtarget);
   case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
   case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
+  case X86ISD::PDEP:        return combinePDEP(N, DAG, DCI);
   }
 
   return SDValue();

diff --git a/llvm/test/CodeGen/X86/bmi2-x86_64.ll b/llvm/test/CodeGen/X86/bmi2-x86_64.ll
index 9f8214d5b3b5..22f0a5da9ce2 100644
--- a/llvm/test/CodeGen/X86/bmi2-x86_64.ll
+++ b/llvm/test/CodeGen/X86/bmi2-x86_64.ll
@@ -44,9 +44,9 @@ define i64 @pdep64_load(i64 %x, i64* %y)   {
 define i64 @pdep64_anyext(i32 %x)   {
 ; CHECK-LABEL: pdep64_anyext:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movslq %edi, %rax
-; CHECK-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; CHECK-NEXT:    pdepq %rcx, %rax, %rax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; CHECK-NEXT:    pdepq %rax, %rdi, %rax
 ; CHECK-NEXT:    retq
   %x1 = sext i32 %x to i64
   %tmp = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x1, i64 6148914691236517205)

diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll
index 637af86dd318..b7a8f2e79fda 100644
--- a/llvm/test/CodeGen/X86/bmi2.ll
+++ b/llvm/test/CodeGen/X86/bmi2.ll
@@ -86,9 +86,8 @@ define i32 @pdep32_anyext(i16 %x)   {
 ;
 ; X64-LABEL: pdep32_anyext:
 ; X64:       # %bb.0:
-; X64-NEXT:    movswl %di, %eax
-; X64-NEXT:    movl $-1431655766, %ecx # imm = 0xAAAAAAAA
-; X64-NEXT:    pdepl %ecx, %eax, %eax
+; X64-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X64-NEXT:    pdepl %eax, %edi, %eax
 ; X64-NEXT:    retq
   %x1 = sext i16 %x to i32
   %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x1, i32 -1431655766)
@@ -101,14 +100,12 @@ define i32 @pdep32_demandedbits(i32 %x) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    pdepl %ecx, %eax, %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pdep32_demandedbits:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl $1431655765, %eax # imm = 0x55555555
 ; X64-NEXT:    pdepl %eax, %edi, %eax
-; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X64-NEXT:    retq
   %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 1431655765)
   %tmp2 = and i32 %tmp, 1431655765
@@ -125,8 +122,7 @@ define i32 @pdep32_demandedbits2(i32 %x, i32 %y) {
 ;
 ; X64-LABEL: pdep32_demandedbits2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    pdepl %esi, %eax, %eax
+; X64-NEXT:    pdepl %esi, %edi, %eax
 ; X64-NEXT:    andl $128, %eax
 ; X64-NEXT:    retq
   %tmp = and i32 %x, 255
@@ -146,8 +142,7 @@ define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) {
 ;
 ; X64-LABEL: pdep32_demandedbits_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    movswl %si, %eax
-; X64-NEXT:    pdepl %eax, %edi, %eax
+; X64-NEXT:    pdepl %esi, %edi, %eax
 ; X64-NEXT:    andl $32768, %eax # imm = 0x8000
 ; X64-NEXT:    retq
   %tmp = sext i16 %y to i32
@@ -167,8 +162,7 @@ define i32 @pdep32_demandedbits_mask2(i32 %x, i16 %y) {
 ;
 ; X64-LABEL: pdep32_demandedbits_mask2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movswl %si, %eax
-; X64-NEXT:    pdepl %eax, %edi, %eax
+; X64-NEXT:    pdepl %esi, %edi, %eax
 ; X64-NEXT:    movzwl %ax, %eax
 ; X64-NEXT:    retq
   %tmp = sext i16 %y to i32
@@ -182,19 +176,15 @@ define i32 @pdep32_knownbits(i32 %x) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    pdepl %ecx, %eax, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    pdepl %ecx, %eax, %eax
+; X86-NEXT:    imull %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pdep32_knownbits:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    pdepl %eax, %edi, %ecx
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    imull %ecx, %eax
+; X64-NEXT:    pdepl %eax, %edi, %eax
+; X64-NEXT:    imull %eax, %eax
 ; X64-NEXT:    retq
   %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 1431655765)
   %tmp2 = and i32 %tmp, 1431655765
@@ -207,19 +197,15 @@ define i32 @pdep32_knownbits2(i32 %x, i32 %y) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl $-256, %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $-256, %eax
-; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
+; X86-NEXT:    imull %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pdep32_knownbits2:
 ; X64:       # %bb.0:
 ; X64-NEXT:    andl $-256, %edi
-; X64-NEXT:    pdepl %esi, %edi, %ecx
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    andl $-256, %eax
-; X64-NEXT:    imull %ecx, %eax
+; X64-NEXT:    pdepl %esi, %edi, %eax
+; X64-NEXT:    imull %eax, %eax
 ; X64-NEXT:    retq
   %tmp = and i32 %x, -256
   %tmp2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %tmp, i32 %y)


        


More information about the llvm-commits mailing list