[llvm] 6865cff - [X86] combineMOVMSK - fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2)) iff pow2splat(c1)

Mon Apr 3 07:11:27 PDT 2023

Author: Simon Pilgrim
Date: 2023-04-03T15:11:13+01:00
New Revision: 6865cff8ea8b07d9f2385fd92cecb422404f0f35

URL: https://github.com/llvm/llvm-project/commit/6865cff8ea8b07d9f2385fd92cecb422404f0f35
DIFF: https://github.com/llvm/llvm-project/commit/6865cff8ea8b07d9f2385fd92cecb422404f0f35.diff

LOG: [X86] combineMOVMSK - fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2)) iff pow2splat(c1)

We already have a similar fold for movmsk(icmp_eq(and(x,c1),0)), with which this could probably be merged, but doing so will involve generalizing a lot of the KnownBits code.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/bitcast-vector-bool.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f66a6f9c499b..13a59ab7fb0e 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54537,6 +54537,32 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
                        DAG.getConstant(NotMask, DL, VT));
   }
 
+  // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
+  // iff pow2splat(c1).
+  // Use KnownBits to determine if only a single bit is non-zero
+  // in each element (pow2 or zero), and shift that bit to the msb.
+  // TODO: Merge with the movmsk(icmp_eq(and(x,c1),0)) fold below?
+  if (Src.getOpcode() == X86ISD::PCMPEQ &&
+      Src.getOperand(0).getOpcode() == ISD::AND &&
+      Src.getOperand(1) == Src.getOperand(0).getOperand(1)) {
+    KnownBits KnownSrc = DAG.computeKnownBits(Src.getOperand(1));
+    if (KnownSrc.countMaxPopulation() == 1) {
+      SDLoc DL(N);
+      MVT ShiftVT = SrcVT;
+      SDValue ShiftSrc = Src.getOperand(0);
+      if (ShiftVT.getScalarType() == MVT::i8) {
+        // vXi8 shifts - we only care about the signbit so can use PSLLW.
+        ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+        ShiftSrc = DAG.getBitcast(ShiftVT, ShiftSrc);
+      }
+      unsigned ShiftAmt = KnownSrc.countMinLeadingZeros();
+      ShiftSrc = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
+                                            ShiftSrc, ShiftAmt, DAG);
+      ShiftSrc = DAG.getBitcast(SrcVT, ShiftSrc);
+      return DAG.getNode(X86ISD::MOVMSK, DL, VT, ShiftSrc);
+    }
+  }
+
   // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
   // iff pow2splat(c1).
   // Use KnownBits to determine if only a single bit is non-zero

diff  --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index d41e783780a8..7477044c86a7 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -109,9 +109,7 @@ define i2 @bitcast_v4i32_to_v2i2(<4 x i32> %a0) nounwind {
 define i1 @trunc_v4i32_cmp(<4 x i32> %a0) nounwind {
 ; SSE2-SSSE3-LABEL: trunc_v4i32_cmp:
 ; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1]
-; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pslld $31, %xmm0
 ; SSE2-SSSE3-NEXT:    movmskps %xmm0, %eax
 ; SSE2-SSSE3-NEXT:    xorl $15, %eax
 ; SSE2-SSSE3-NEXT:    sete %al
@@ -263,9 +261,7 @@ define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
 define i1 @trunc_v16i8_cmp(<16 x i8> %a0) nounwind {
 ; SSE2-SSSE3-LABEL: trunc_v16i8_cmp:
 ; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    psllw $7, %xmm0
 ; SSE2-SSSE3-NEXT:    pmovmskb %xmm0, %eax
 ; SSE2-SSSE3-NEXT:    xorl $65535, %eax # imm = 0xFFFF
 ; SSE2-SSSE3-NEXT:    setne %al
@@ -402,9 +398,7 @@ define i1 @trunc_v8i132_cmp(<8 x i32> %a0) nounwind {
 ; SSE2-SSSE3-LABEL: trunc_v8i132_cmp:
 ; SSE2-SSSE3:       # %bb.0:
 ; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1]
-; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pslld $31, %xmm0
 ; SSE2-SSSE3-NEXT:    movmskps %xmm0, %eax
 ; SSE2-SSSE3-NEXT:    xorl $15, %eax
 ; SSE2-SSSE3-NEXT:    setne %al
@@ -588,9 +582,7 @@ define i1 @trunc_v32i8_cmp(<32 x i8> %a0) nounwind {
 ; SSE2-SSSE3-LABEL: trunc_v32i8_cmp:
 ; SSE2-SSSE3:       # %bb.0:
 ; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    psllw $7, %xmm0
 ; SSE2-SSSE3-NEXT:    pmovmskb %xmm0, %eax
 ; SSE2-SSSE3-NEXT:    xorl $65535, %eax # imm = 0xFFFF
 ; SSE2-SSSE3-NEXT:    sete %al