[llvm] ad3d6f9 - [SelectionDAG][X86][ARM][AArch64] Add ISD opcode for __builtin_parity. Expand it to shifts and xors.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sat Sep 12 11:43:16 PDT 2020


Author: Craig Topper
Date: 2020-09-12T11:42:18-07:00
New Revision: ad3d6f993d9f7ff3a54c5a716ccc918026fa0252

URL: https://github.com/llvm/llvm-project/commit/ad3d6f993d9f7ff3a54c5a716ccc918026fa0252
DIFF: https://github.com/llvm/llvm-project/commit/ad3d6f993d9f7ff3a54c5a716ccc918026fa0252.diff

LOG: [SelectionDAG][X86][ARM][AArch64] Add ISD opcode for __builtin_parity. Expand it to shifts and xors.

Clang emits (and (ctpop X), 1) for __builtin_parity. If ctpop
isn't natively supported by the target, this leads to poor codegen
due to the expansion of ctpop being more complex than what is needed
for parity.

This adds a DAG combine to convert the pattern to ISD::PARITY
before operation legalization. Type legalization is updated
to handle Expanding and Promoting this operation. If after type
legalization, CTPOP is supported for this type, LegalizeDAG will
turn it back into CTPOP+AND. Otherwise LegalizeDAG will emit a
series of shifts and xors followed by an AND with 1.

I've avoided vectors in this patch to avoid introducing more
legalization complexity.

X86 previously had a custom DAG combiner for this. This is now
moved to Custom lowering for the new opcode. There is a minor
regression in vector-reduce-xor-bool.ll, but a follow up patch
can easily fix that.

Fixes PR47433

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D87209

Added: 
    llvm/test/CodeGen/AArch64/parity.ll
    llvm/test/CodeGen/ARM/parity.ll

Modified: 
    llvm/include/llvm/CodeGen/ISDOpcodes.h
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
    llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
    llvm/lib/CodeGen/TargetLoweringBase.cpp
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/parity.ll
    llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index ae08d6e9313d..ba5a5d6e8751 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -598,6 +598,7 @@ enum NodeType {
   CTLZ,
   CTPOP,
   BITREVERSE,
+  PARITY,
 
   /// Bit counting operators with an undefined result for zero inputs.
   CTTZ_ZERO_UNDEF,

diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index eaa70444578a..3aaf5e01d26a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5574,6 +5574,25 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     if (SDValue V = combineShiftAnd1ToBitTest(N, DAG))
       return V;
 
+  // fold (and (ctpop X), 1) -> parity X
+  // Only do this before op legalization as it might be turned back into ctpop.
+  // TODO: Support vectors?
+  if (!LegalOperations && isOneConstant(N1) && N0.hasOneUse()) {
+    SDValue Tmp = N0;
+
+    // It's possible the ctpop has been truncated, but since we only care about
+    // the LSB we can look through it.
+    if (Tmp.getOpcode() == ISD::TRUNCATE && Tmp.getOperand(0).hasOneUse())
+      Tmp = Tmp.getOperand(0);
+
+    if (Tmp.getOpcode() == ISD::CTPOP) {
+      SDLoc dl(N);
+      SDValue Parity =
+          DAG.getNode(ISD::PARITY, dl, Tmp.getValueType(), Tmp.getOperand(0));
+      return DAG.getNode(ISD::TRUNCATE, dl, VT, Parity);
+    }
+  }
+
   return SDValue();
 }
 

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 7751ebb7705a..71ba228b53f6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -181,6 +181,7 @@ class SelectionDAGLegalize {
 
   SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl);
   SDValue ExpandBSWAP(SDValue Op, const SDLoc &dl);
+  SDValue ExpandPARITY(SDValue Op, const SDLoc &dl);
 
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
   SDValue ExpandInsertToVectorThroughStack(SDValue Op);
@@ -2785,6 +2786,28 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) {
   }
 }
 
+/// Open code the operations for PARITY of the specified operation.
+SDValue SelectionDAGLegalize::ExpandPARITY(SDValue Op, const SDLoc &dl) {
+  EVT VT = Op.getValueType();
+  EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+  unsigned Sz = VT.getScalarSizeInBits();
+
+  // If CTPOP is legal, use it. Otherwise use shifts and xor.
+  SDValue Result;
+  if (TLI.isOperationLegal(ISD::CTPOP, VT)) {
+    Result = DAG.getNode(ISD::CTPOP, dl, VT, Op);
+  } else {
+    Result = Op;
+    for (unsigned i = Log2_32_Ceil(Sz); i != 0;) {
+      SDValue Shift = DAG.getNode(ISD::SRL, dl, VT, Result,
+                                  DAG.getConstant(1 << (--i), dl, ShVT));
+      Result = DAG.getNode(ISD::XOR, dl, VT, Result, Shift);
+    }
+  }
+
+  return DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(1, dl, VT));
+}
+
 bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   LLVM_DEBUG(dbgs() << "Trying to expand node\n");
   SmallVector<SDValue, 8> Results;
@@ -2816,6 +2839,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   case ISD::BSWAP:
     Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
     break;
+  case ISD::PARITY:
+    Results.push_back(ExpandPARITY(Node->getOperand(0), dl));
+    break;
   case ISD::FRAMEADDR:
   case ISD::RETURNADDR:
   case ISD::FRAME_TO_ARGS_OFFSET:

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index bfe1b365efc4..0000fcb1dde1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -62,7 +62,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::Constant:    Res = PromoteIntRes_Constant(N); break;
   case ISD::CTLZ_ZERO_UNDEF:
   case ISD::CTLZ:        Res = PromoteIntRes_CTLZ(N); break;
-  case ISD::CTPOP:       Res = PromoteIntRes_CTPOP(N); break;
+  case ISD::PARITY:
+  case ISD::CTPOP:       Res = PromoteIntRes_CTPOP_PARITY(N); break;
   case ISD::CTTZ_ZERO_UNDEF:
   case ISD::CTTZ:        Res = PromoteIntRes_CTTZ(N); break;
   case ISD::EXTRACT_VECTOR_ELT:
@@ -503,10 +504,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
                       NVT));
 }
 
-SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) {
-  // Zero extend to the promoted type and do the count there.
+SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) {
+  // Zero extend to the promoted type and do the count or parity there.
   SDValue Op = ZExtPromotedInteger(N->getOperand(0));
-  return DAG.getNode(ISD::CTPOP, SDLoc(N), Op.getValueType(), Op);
+  return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
@@ -1980,6 +1981,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::AssertZext:  ExpandIntRes_AssertZext(N, Lo, Hi); break;
   case ISD::BITREVERSE:  ExpandIntRes_BITREVERSE(N, Lo, Hi); break;
   case ISD::BSWAP:       ExpandIntRes_BSWAP(N, Lo, Hi); break;
+  case ISD::PARITY:      ExpandIntRes_PARITY(N, Lo, Hi); break;
   case ISD::Constant:    ExpandIntRes_Constant(N, Lo, Hi); break;
   case ISD::ABS:         ExpandIntRes_ABS(N, Lo, Hi); break;
   case ISD::CTLZ_ZERO_UNDEF:
@@ -2772,6 +2774,17 @@ void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N,
   Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi);
 }
 
+void DAGTypeLegalizer::ExpandIntRes_PARITY(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDLoc dl(N);
+  // parity(HiLo) -> parity(Lo^Hi)
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+  Lo =
+      DAG.getNode(ISD::PARITY, dl, NVT, DAG.getNode(ISD::XOR, dl, NVT, Lo, Hi));
+  Hi = DAG.getConstant(0, dl, NVT);
+}
+
 void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N,
                                              SDValue &Lo, SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 34c563672753..86f4fcc023dd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -311,7 +311,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntRes_BUILD_PAIR(SDNode *N);
   SDValue PromoteIntRes_Constant(SDNode *N);
   SDValue PromoteIntRes_CTLZ(SDNode *N);
-  SDValue PromoteIntRes_CTPOP(SDNode *N);
+  SDValue PromoteIntRes_CTPOP_PARITY(SDNode *N);
   SDValue PromoteIntRes_CTTZ(SDNode *N);
   SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
@@ -431,6 +431,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void ExpandIntRes_ADDSUBCARRY       (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_BITREVERSE        (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_BSWAP             (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_PARITY            (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_MUL               (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_SDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_SREM              (SDNode *N, SDValue &Lo, SDValue &Hi);

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index fcd09b614167..f854a4f4d35f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -412,6 +412,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::CTTZ_ZERO_UNDEF:            return "cttz_zero_undef";
   case ISD::CTLZ:                       return "ctlz";
   case ISD::CTLZ_ZERO_UNDEF:            return "ctlz_zero_undef";
+  case ISD::PARITY:                     return "parity";
 
   // Trampolines
   case ISD::INIT_TRAMPOLINE:            return "init_trampoline";

diff  --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 958bb7939046..7ef37db68a28 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -692,6 +692,7 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 
     setOperationAction(ISD::BITREVERSE, VT, Expand);
+    setOperationAction(ISD::PARITY, VT, Expand);
 
     // These library functions default to expand.
     setOperationAction(ISD::FROUND, VT, Expand);

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8913dff47df4..5f7721267db0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -385,6 +385,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
 
+  setOperationAction(ISD::PARITY, MVT::i8, Custom);
   if (Subtarget.hasPOPCNT()) {
     setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
   } else {
@@ -395,6 +396,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
     else
       setOperationAction(ISD::CTPOP        , MVT::i64  , Custom);
+
+    setOperationAction(ISD::PARITY, MVT::i16, Custom);
+    setOperationAction(ISD::PARITY, MVT::i32, Custom);
+    if (Subtarget.is64Bit())
+      setOperationAction(ISD::PARITY, MVT::i64, Custom);
   }
 
   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
@@ -28865,6 +28871,58 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
 }
 
+static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue X = Op.getOperand(0);
+  MVT VT = Op.getSimpleValueType();
+
+  // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
+  if (VT == MVT::i8 ||
+      DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
+    X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+    SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
+                                DAG.getConstant(0, DL, MVT::i8));
+    // Copy the inverse of the parity flag into a register with setcc.
+    SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+    // Extend to the original type.
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
+  }
+
+  if (VT == MVT::i64) {
+    // Xor the high and low 16-bits together using a 32-bit operation.
+    SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
+                             DAG.getNode(ISD::SRL, DL, MVT::i64, X,
+                                         DAG.getConstant(32, DL, MVT::i8)));
+    SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
+    X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
+  }
+
+  if (VT != MVT::i16) {
+    // Xor the high and low 16-bits together using a 32-bit operation.
+    SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
+                               DAG.getConstant(16, DL, MVT::i8));
+    X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
+  } else {
+    // If the input is 16-bits, we need to extend to use an i32 shift below.
+    X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
+  }
+
+  // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
+  // This should allow an h-reg to be used to save a shift.
+  SDValue Hi = DAG.getNode(
+      ISD::TRUNCATE, DL, MVT::i8,
+      DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+  SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
+  SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
+
+  // Copy the inverse of the parity flag into a register with setcc.
+  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+  // Extend to the original type.
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
+}
+
 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
   unsigned NewOpc = 0;
@@ -29483,6 +29541,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG, Subtarget);
   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
+  case ISD::PARITY:             return LowerPARITY(Op, Subtarget, DAG);
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
   case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
@@ -43285,89 +43344,6 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
   return SDValue();
 }
 
-// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
-// Turn it into series of XORs and a setnp.
-static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
-                             const X86Subtarget &Subtarget) {
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
-  // RHS needs to be 1.
-  if (!isOneConstant(N1))
-    return SDValue();
-
-  // Popcnt may be truncated.
-  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
-    N0 = N0.getOperand(0);
-
-  // LHS needs to be a single use CTPOP.
-  if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
-    return SDValue();
-
-  EVT VT = N0.getValueType();
-
-  // We only support 64-bit and 32-bit. 64-bit requires special handling
-  // unless the 64-bit popcnt instruction is legal.
-  if (VT != MVT::i32 && VT != MVT::i64)
-    return SDValue();
-
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
-    return SDValue();
-
-  SDLoc DL(N);
-  SDValue X = N0.getOperand(0);
-
-  // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
-  if (DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
-    X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
-    SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
-                                DAG.getConstant(0, DL, MVT::i8));
-    // Copy the inverse of the parity flag into a register with setcc.
-    SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
-    // Extend or truncate to the original type.
-    return DAG.getZExtOrTrunc(Setnp, DL, N->getValueType(0));
-  }
-
-  // If this is 64-bit, its always best to xor the two 32-bit pieces together
-  // even if we have popcnt.
-  if (VT == MVT::i64) {
-    SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
-                             DAG.getNode(ISD::SRL, DL, VT, X,
-                                         DAG.getConstant(32, DL, MVT::i8)));
-    SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
-    X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
-    // Generate a 32-bit parity idiom. This will bring us back here if we need
-    // to expand it too.
-    SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
-                                 DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
-                                 DAG.getConstant(1, DL, MVT::i32));
-    return DAG.getZExtOrTrunc(Parity, DL, N->getValueType(0));
-  }
-  assert(VT == MVT::i32 && "Unexpected VT!");
-
-  // Xor the high and low 16-bits together using a 32-bit operation.
-  SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
-                             DAG.getConstant(16, DL, MVT::i8));
-  X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
-
-  // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
-  // This should allow an h-reg to be used to save a shift.
-  // FIXME: We only get an h-reg in 32-bit mode.
-  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
-                           DAG.getNode(ISD::SRL, DL, VT, X,
-                                       DAG.getConstant(8, DL, MVT::i8)));
-  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
-  SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
-  SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
-
-  // Copy the inverse of the parity flag into a register with setcc.
-  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
-  // Extend or truncate to the original type.
-  return DAG.getZExtOrTrunc(Setnp, DL, N->getValueType(0));
-}
-
-
 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
 // Where C is a mask containing the same number of bits as the setcc and
 // where the setcc will freely 0 upper bits of k-register. We can replace the
@@ -43459,10 +43435,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // This must be done before legalization has expanded the ctpop.
-  if (SDValue V = combineParity(N, DAG, Subtarget))
-    return V;
-
   // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
   // TODO: Support multiple SrcOps.
   if (VT == MVT::i1) {

diff  --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll
new file mode 100644
index 000000000000..bdddb6f1069c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/parity.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+define i4 @parity_4(i4 %x) {
+; CHECK-LABEL: parity_4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xf
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i4 @llvm.ctpop.i4(i4 %x)
+  %2 = and i4 %1, 1
+  ret i4 %2
+}
+
+define i8 @parity_8(i8 %x) {
+; CHECK-LABEL: parity_8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i8 @llvm.ctpop.i8(i8 %x)
+  %2 = and i8 %1, 1
+  ret i8 %2
+}
+
+define i16 @parity_16(i16 %x) {
+; CHECK-LABEL: parity_16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    eor w8, w8, w8, lsr #8
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i16 @llvm.ctpop.i16(i16 %x)
+  %2 = and i16 %1, 1
+  ret i16 %2
+}
+
+define i17 @parity_17(i17 %x) {
+; CHECK-LABEL: parity_17:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0x1ffff
+; CHECK-NEXT:    eor w8, w8, w8, lsr #16
+; CHECK-NEXT:    eor w8, w8, w8, lsr #8
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i17 @llvm.ctpop.i17(i17 %x)
+  %2 = and i17 %1, 1
+  ret i17 %2
+}
+
+define i32 @parity_32(i32 %x) {
+; CHECK-LABEL: parity_32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor w8, w0, w0, lsr #16
+; CHECK-NEXT:    eor w8, w8, w8, lsr #8
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
+  %2 = and i32 %1, 1
+  ret i32 %2
+}
+
+define i64 @parity_64(i64 %x) {
+; CHECK-LABEL: parity_64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor x8, x0, x0, lsr #32
+; CHECK-NEXT:    eor x8, x8, x8, lsr #16
+; CHECK-NEXT:    eor x8, x8, x8, lsr #8
+; CHECK-NEXT:    eor x8, x8, x8, lsr #4
+; CHECK-NEXT:    eor x8, x8, x8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and x0, x8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
+  %2 = and i64 %1, 1
+  ret i64 %2
+}
+
+define i32 @parity_64_trunc(i64 %x) {
+; CHECK-LABEL: parity_64_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor x8, x0, x0, lsr #32
+; CHECK-NEXT:    eor x8, x8, x8, lsr #16
+; CHECK-NEXT:    eor x8, x8, x8, lsr #8
+; CHECK-NEXT:    eor x8, x8, x8, lsr #4
+; CHECK-NEXT:    eor x8, x8, x8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
+  %2 = trunc i64 %1 to i32
+  %3 = and i32 %2, 1
+  ret i32 %3
+}
+
+define i8 @parity_32_trunc(i32 %x) {
+; CHECK-LABEL: parity_32_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor w8, w0, w0, lsr #16
+; CHECK-NEXT:    eor w8, w8, w8, lsr #8
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
+  %2 = trunc i32 %1 to i8
+  %3 = and i8 %2, 1
+  ret i8 %3
+}
+
+define i32 @parity_8_zext(i8 %x) {
+; CHECK-LABEL: parity_8_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %a = zext i8 %x to i32
+  %b = tail call i32 @llvm.ctpop.i32(i32 %a)
+  %c = and i32 %b, 1
+  ret i32 %c
+}
+
+define i32 @parity_8_mask(i32 %x) {
+; CHECK-LABEL: parity_8_mask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    eor w8, w8, w8, lsr #4
+; CHECK-NEXT:    eor w8, w8, w8, lsr #2
+; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %a = and i32 %x, 255
+  %b = tail call i32 @llvm.ctpop.i32(i32 %a)
+  %c = and i32 %b, 1
+  ret i32 %c
+}
+
+declare i4 @llvm.ctpop.i4(i4 %x)
+declare i8 @llvm.ctpop.i8(i8 %x)
+declare i16 @llvm.ctpop.i16(i16 %x)
+declare i17 @llvm.ctpop.i17(i17 %x)
+declare i32 @llvm.ctpop.i32(i32 %x)
+declare i64 @llvm.ctpop.i64(i64 %x)

diff  --git a/llvm/test/CodeGen/ARM/parity.ll b/llvm/test/CodeGen/ARM/parity.ll
new file mode 100644
index 000000000000..40c0d7bd32f1
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/parity.ll
@@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple arm-eabi -mattr=+v6t2 | FileCheck %s
+
+define i4 @parity_4(i4 %x) {
+; CHECK-LABEL: parity_4:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    and r0, r0, #15
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i4 @llvm.ctpop.i4(i4 %x)
+  %2 = and i4 %1, 1
+  ret i4 %2
+}
+
+define i8 @parity_8(i8 %x) {
+; CHECK-LABEL: parity_8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i8 @llvm.ctpop.i8(i8 %x)
+  %2 = and i8 %1, 1
+  ret i8 %2
+}
+
+define i16 @parity_16(i16 %x) {
+; CHECK-LABEL: parity_16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i16 @llvm.ctpop.i16(i16 %x)
+  %2 = and i16 %1, 1
+  ret i16 %2
+}
+
+define i17 @parity_17(i17 %x) {
+; CHECK-LABEL: parity_17:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    bfc r0, #17, #15
+; CHECK-NEXT:    eor r0, r0, r0, lsr #16
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i17 @llvm.ctpop.i17(i17 %x)
+  %2 = and i17 %1, 1
+  ret i17 %2
+}
+
+define i32 @parity_32(i32 %x) {
+; CHECK-LABEL: parity_32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    eor r0, r0, r0, lsr #16
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
+  %2 = and i32 %1, 1
+  ret i32 %2
+}
+
+define i64 @parity_64(i64 %x) {
+; CHECK-LABEL: parity_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    eor r0, r0, r1
+; CHECK-NEXT:    mov r1, #0
+; CHECK-NEXT:    eor r0, r0, r0, lsr #16
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
+  %2 = and i64 %1, 1
+  ret i64 %2
+}
+
+define i32 @parity_64_trunc(i64 %x) {
+; CHECK-LABEL: parity_64_trunc:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    eor r0, r0, r1
+; CHECK-NEXT:    eor r0, r0, r0, lsr #16
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
+  %2 = trunc i64 %1 to i32
+  %3 = and i32 %2, 1
+  ret i32 %3
+}
+
+define i8 @parity_32_trunc(i32 %x) {
+; CHECK-LABEL: parity_32_trunc:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    eor r0, r0, r0, lsr #16
+; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
+  %2 = trunc i32 %1 to i8
+  %3 = and i8 %2, 1
+  ret i8 %3
+}
+
+define i32 @parity_8_zext(i8 %x) {
+; CHECK-LABEL: parity_8_zext:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %a = zext i8 %x to i32
+  %b = tail call i32 @llvm.ctpop.i32(i32 %a)
+  %c = and i32 %b, 1
+  ret i32 %c
+}
+
+define i32 @parity_8_mask(i32 %x) {
+; CHECK-LABEL: parity_8_mask:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    eor r0, r0, r0, lsr #4
+; CHECK-NEXT:    eor r0, r0, r0, lsr #2
+; CHECK-NEXT:    eor r0, r0, r0, lsr #1
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    bx lr
+  %a = and i32 %x, 255
+  %b = tail call i32 @llvm.ctpop.i32(i32 %a)
+  %c = and i32 %b, 1
+  ret i32 %c
+}
+
+declare i4 @llvm.ctpop.i4(i4 %x)
+declare i8 @llvm.ctpop.i8(i8 %x)
+declare i16 @llvm.ctpop.i16(i16 %x)
+declare i17 @llvm.ctpop.i17(i17 %x)
+declare i32 @llvm.ctpop.i32(i32 %x)
+declare i64 @llvm.ctpop.i64(i64 %x)

diff  --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll
index 6289ab482426..d7344a4a2ed7 100644
--- a/llvm/test/CodeGen/X86/parity.ll
+++ b/llvm/test/CodeGen/X86/parity.ll
@@ -4,6 +4,187 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X86-POPCNT
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT
 
+define i4 @parity_4(i4 %x) {
+; X86-NOPOPCNT-LABEL: parity_4:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    testb $15, {{[0-9]+}}(%esp)
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_4:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    testb $15, %dil
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_4:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    testb $15, {{[0-9]+}}(%esp)
+; X86-POPCNT-NEXT:    setnp %al
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_4:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    testb $15, %dil
+; X64-POPCNT-NEXT:    setnp %al
+; X64-POPCNT-NEXT:    retq
+  %1 = tail call i4 @llvm.ctpop.i4(i4 %x)
+  %2 = and i4 %1, 1
+  ret i4 %2
+}
+
+define i8 @parity_8(i8 %x) {
+; X86-NOPOPCNT-LABEL: parity_8:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_8:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    testb %dil, %dil
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_8:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-POPCNT-NEXT:    setnp %al
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_8:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    testb %dil, %dil
+; X64-POPCNT-NEXT:    setnp %al
+; X64-POPCNT-NEXT:    retq
+  %1 = tail call i8 @llvm.ctpop.i8(i8 %x)
+  %2 = and i8 %1, 1
+  ret i8 %2
+}
+
+define i16 @parity_16(i16 %x) {
+; X86-NOPOPCNT-LABEL: parity_16:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X86-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_16:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    movl %edi, %ecx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X64-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_16:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    popcntw {{[0-9]+}}(%esp), %ax
+; X86-POPCNT-NEXT:    andl $1, %eax
+; X86-POPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_16:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    popcntw %di, %ax
+; X64-POPCNT-NEXT:    andl $1, %eax
+; X64-POPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-POPCNT-NEXT:    retq
+  %1 = tail call i16 @llvm.ctpop.i16(i16 %x)
+  %2 = and i16 %1, 1
+  ret i16 %2
+}
+
+define i16 @parity_16_load(i16* %x) {
+; X86-NOPOPCNT-LABEL: parity_16_load:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOPOPCNT-NEXT:    movzwl (%eax), %ecx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X86-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_16_load:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    movzwl (%rdi), %ecx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X64-NOPOPCNT-NEXT:    xorb %ch, %cl
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_16_load:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-POPCNT-NEXT:    popcntw (%eax), %ax
+; X86-POPCNT-NEXT:    andl $1, %eax
+; X86-POPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_16_load:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    popcntw (%rdi), %ax
+; X64-POPCNT-NEXT:    andl $1, %eax
+; X64-POPCNT-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-POPCNT-NEXT:    retq
+  %1 = load i16, i16* %x
+  %2 = tail call i16 @llvm.ctpop.i16(i16 %1)
+  %3 = and i16 %2, 1
+  ret i16 %3
+}
+
+define i17 @parity_17(i17 %x) {
+; X86-NOPOPCNT-LABEL: parity_17:
+; X86-NOPOPCNT:       # %bb.0:
+; X86-NOPOPCNT-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOPOPCNT-NEXT:    movl %ecx, %eax
+; X86-NOPOPCNT-NEXT:    andl $131071, %eax # imm = 0x1FFFF
+; X86-NOPOPCNT-NEXT:    movl %eax, %edx
+; X86-NOPOPCNT-NEXT:    shrl $16, %edx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %edx
+; X86-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X86-NOPOPCNT-NEXT:    xorb %dl, %ch
+; X86-NOPOPCNT-NEXT:    setnp %al
+; X86-NOPOPCNT-NEXT:    retl
+;
+; X64-NOPOPCNT-LABEL: parity_17:
+; X64-NOPOPCNT:       # %bb.0:
+; X64-NOPOPCNT-NEXT:    movl %edi, %eax
+; X64-NOPOPCNT-NEXT:    andl $131071, %eax # imm = 0x1FFFF
+; X64-NOPOPCNT-NEXT:    movl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    shrl $16, %ecx
+; X64-NOPOPCNT-NEXT:    xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    shrl $8, %edi
+; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
+; X64-NOPOPCNT-NEXT:    xorb %cl, %dil
+; X64-NOPOPCNT-NEXT:    setnp %al
+; X64-NOPOPCNT-NEXT:    retq
+;
+; X86-POPCNT-LABEL: parity_17:
+; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    movl $131071, %eax # imm = 0x1FFFF
+; X86-POPCNT-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-POPCNT-NEXT:    popcntl %eax, %eax
+; X86-POPCNT-NEXT:    andl $1, %eax
+; X86-POPCNT-NEXT:    retl
+;
+; X64-POPCNT-LABEL: parity_17:
+; X64-POPCNT:       # %bb.0:
+; X64-POPCNT-NEXT:    andl $131071, %edi # imm = 0x1FFFF
+; X64-POPCNT-NEXT:    popcntl %edi, %eax
+; X64-POPCNT-NEXT:    andl $1, %eax
+; X64-POPCNT-NEXT:    retq
+  %1 = tail call i17 @llvm.ctpop.i17(i17 %x)
+  %2 = and i17 %1, 1
+  ret i17 %2
+}
+
 define i32 @parity_32(i32 %x) {
 ; X86-NOPOPCNT-LABEL: parity_32:
 ; X86-NOPOPCNT:       # %bb.0:
@@ -157,14 +338,14 @@ define i8 @parity_32_trunc(i32 %x) {
 ; X86-POPCNT-LABEL: parity_32_trunc:
 ; X86-POPCNT:       # %bb.0:
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %eax
-; X86-POPCNT-NEXT:    andb $1, %al
+; X86-POPCNT-NEXT:    andl $1, %eax
 ; X86-POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-POPCNT-NEXT:    retl
 ;
 ; X64-POPCNT-LABEL: parity_32_trunc:
 ; X64-POPCNT:       # %bb.0:
 ; X64-POPCNT-NEXT:    popcntl %edi, %eax
-; X64-POPCNT-NEXT:    andb $1, %al
+; X64-POPCNT-NEXT:    andl $1, %eax
 ; X64-POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-POPCNT-NEXT:    retq
   %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -241,5 +422,9 @@ define i32 @parity_8_mask(i32 %x) {
   ret i32 %c
 }
 
+declare i4 @llvm.ctpop.i4(i4 %x)
+declare i8 @llvm.ctpop.i8(i8 %x)
+declare i16 @llvm.ctpop.i16(i16 %x)
+declare i17 @llvm.ctpop.i17(i17 %x)
 declare i32 @llvm.ctpop.i32(i32 %x)
 declare i64 @llvm.ctpop.i64(i64 %x)

diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index fb019ffd99e9..06a428c514a7 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -53,7 +53,7 @@ define i1 @trunc_v2i64_v2i1(<2 x i64>) {
 ; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $3, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = trunc <2 x i64> %0 to <2 x i1>
@@ -103,7 +103,7 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) {
 ; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $15, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = trunc <4 x i32> %0 to <4 x i1>
@@ -251,7 +251,7 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
 ; AVX512VL-NEXT:    vpsllq $63, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vptestmq %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $15, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -974,7 +974,7 @@ define i1 @icmp_v2i64_v2i1(<2 x i64>) {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $3, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <2 x i64> %0, zeroinitializer
@@ -1025,7 +1025,7 @@ define i1 @icmp_v4i32_v4i1(<4 x i32>) {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $15, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <4 x i32> %0, zeroinitializer
@@ -1214,7 +1214,7 @@ define i1 @icmp_v4i64_v4i1(<4 x i64>) {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vptestnmq %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb %al, %al
+; AVX512VL-NEXT:    testb $15, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq


        


More information about the llvm-commits mailing list