[llvm] r331091 - [X86] Make the STTNI flag intrinsics use the flags from pcmpestrm/pcmpistrm if the mask intrinsics are also used in the same basic block.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 27 15:15:33 PDT 2018


Author: ctopper
Date: Fri Apr 27 15:15:33 2018
New Revision: 331091

URL: http://llvm.org/viewvc/llvm-project?rev=331091&view=rev
Log:
[X86] Make the STTNI flag intrinsics use the flags from pcmpestrm/pcmpistrm if the mask intrinsics are also used in the same basic block.

Summary:
Previously the flag intrinsics always used the index instructions even if a mask instruction also exists.

To fix this I've created a single ISD node type that returns index, mask, and flags. The SelectionDAG CSE process will merge all flavors of intrinsics with the same inputs to a single node. Then during isel we just have to look at which results are used to know what instruction to generate. If both mask and index are used we'll need to emit two instructions. But for all other cases we can emit a single instruction.

Since I had to do manual isel anyway, I've removed the pseudo instructions and custom inserter code that was working around tablegen limitations with multiple implicit defs.

I've also renamed the recently added sse42.ll test case to sttni.ll since it focuses on that subset of the sse4.2 instructions.

Reviewers: chandlerc, RKSimon, spatel

Reviewed By: chandlerc

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D46202

Added:
    llvm/trunk/test/CodeGen/X86/sttni.ll
      - copied, changed from r331090, llvm/trunk/test/CodeGen/X86/sse42.ll
Removed:
    llvm/trunk/test/CodeGen/X86/sse42.ll
Modified:
    llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.h
    llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
    llvm/trunk/lib/Target/X86/X86InstrSSE.td

Modified: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp?rev=331091&r1=331090&r2=331091&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp Fri Apr 27 15:15:33 2018
@@ -226,7 +226,7 @@ namespace {
                      SDValue &Index, SDValue &Disp,
                      SDValue &Segment);
 
-    // Convience method where P is also root.
+    // Convenience method where P is also root.
     bool tryFoldLoad(SDNode *P, SDValue N,
                      SDValue &Base, SDValue &Scale,
                      SDValue &Index, SDValue &Disp,
@@ -234,6 +234,12 @@ namespace {
       return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
     }
 
+    // Try to fold a vector load. This makes sure the load isn't non-temporal.
+    bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
+                        SDValue &Base, SDValue &Scale,
+                        SDValue &Index, SDValue &Disp,
+                        SDValue &Segment);
+
     /// Implement addressing mode selection for inline asm expressions.
     bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                       unsigned ConstraintID,
@@ -449,6 +455,12 @@ namespace {
     bool matchBEXTRFromAnd(SDNode *Node);
     bool shrinkAndImmediate(SDNode *N);
     bool isMaskZeroExtended(SDNode *N) const;
+
+    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+                                const SDLoc &dl, MVT VT, SDNode *Node);
+    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+                                const SDLoc &dl, MVT VT, SDNode *Node,
+                                SDValue &InFlag);
   };
 }
 
@@ -2006,6 +2018,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode
                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
 }
 
+bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
+                                     SDValue &Base, SDValue &Scale,
+                                     SDValue &Index, SDValue &Disp,
+                                     SDValue &Segment) {
+  if (!ISD::isNON_EXTLoad(N.getNode()) ||
+      useNonTemporalLoad(cast<LoadSDNode>(N)) ||
+      !IsProfitableToFold(N, P, Root) ||
+      !IsLegalToFold(N, P, Root, OptLevel))
+    return false;
+
+  return selectAddr(N.getNode(),
+                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
 /// Return an SDNode that returns the value of the global base register.
 /// Output instructions required to initialize the global base register,
 /// if necessary.
@@ -2563,6 +2589,83 @@ bool X86DAGToDAGISel::matchBEXTRFromAnd(
   return true;
 }
 
+// Emit a PCMPISTR(I/M) instruction.
+MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
+                                             bool MayFoldLoad, const SDLoc &dl,
+                                             MVT VT, SDNode *Node) {
+  SDValue N0 = Node->getOperand(0);
+  SDValue N1 = Node->getOperand(1);
+  SDValue Imm = Node->getOperand(2);
+  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+  // If there is a load, it will be behind a bitcast. We don't need to check
+  // alignment on this load.
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
+      tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
+                     Tmp3, Tmp4)) {
+    SDValue Load = N1.getOperand(0);
+    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+                      Load.getOperand(0) };
+    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
+    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+    // Update the chain.
+    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    // Record the mem-refs
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
+    CNode->setMemRefs(MemOp, MemOp + 1);
+    return CNode;
+  }
+
+  SDValue Ops[] = { N0, N1, Imm };
+  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
+  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+  return CNode;
+}
+
+// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
+// to emit a second instruction after this one. This is needed since we have two
+// copyToReg nodes glued before this and we need to continue that glue through.
+MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
+                                             bool MayFoldLoad, const SDLoc &dl,
+                                             MVT VT, SDNode *Node,
+                                             SDValue &InFlag) {
+  SDValue N0 = Node->getOperand(0);
+  SDValue N2 = Node->getOperand(2);
+  SDValue Imm = Node->getOperand(4);
+  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+  // If there is a load, it will be behind a bitcast. We don't need to check
+  // alignment on this load.
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
+      tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
+                     Tmp3, Tmp4)) {
+    SDValue Load = N2.getOperand(0);
+    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+                      Load.getOperand(0), InFlag };
+    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
+    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+    InFlag = SDValue(CNode, 3);
+    // Update the chain.
+    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    // Record the mem-refs
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
+    CNode->setMemRefs(MemOp, MemOp + 1);
+    return CNode;
+  }
+
+  SDValue Ops[] = { N0, N2, Imm, InFlag };
+  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
+  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+  InFlag = SDValue(CNode, 2);
+  return CNode;
+}
+
 /// If the high bits of an 'and' operand are known zero, try setting the
 /// high bits of an 'and' constant operand to produce a smaller encoding by
 /// creating a small, sign-extended negative immediate rather than a large
@@ -3184,6 +3287,70 @@ void X86DAGToDAGISel::Select(SDNode *Nod
     }
     break;
   }
+  case X86ISD::PCMPISTR: {
+    if (!Subtarget->hasSSE42())
+      break;
+
+    bool NeedIndex = !SDValue(Node, 0).use_empty();
+    bool NeedMask = !SDValue(Node, 1).use_empty();
+    // We can't fold a load if we are going to make two instructions.
+    bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+    MachineSDNode *CNode;
+    if (NeedMask) {
+      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
+      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
+      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
+      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+    }
+    if (NeedIndex || !NeedMask) {
+      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
+      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
+      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
+      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+    }
+
+    // Connect the flag usage to the last instruction created.
+    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 0));
+    CurDAG->RemoveDeadNode(Node);
+    return;
+  }
+  case X86ISD::PCMPESTR: {
+    if (!Subtarget->hasSSE42())
+      break;
+
+    // Copy the two implicit register inputs.
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
+                                          Node->getOperand(1),
+                                          SDValue()).getValue(1);
+    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
+                                  Node->getOperand(3), InFlag).getValue(1);
+
+    bool NeedIndex = !SDValue(Node, 0).use_empty();
+    bool NeedMask = !SDValue(Node, 1).use_empty();
+    // We can't fold a load if we are going to make two instructions.
+    bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+    MachineSDNode *CNode;
+    if (NeedMask) {
+      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
+      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
+      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
+                           InFlag);
+      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+    }
+    if (NeedIndex || !NeedMask) {
+      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
+      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
+      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
+      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+    }
+    // Connect the flag usage to the last instruction created.
+    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
+    CurDAG->RemoveDeadNode(Node);
+    return;
+  }
+
   case ISD::STORE:
     if (foldLoadStoreIntoMemOperand(Node))
       return;

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=331091&r1=331090&r2=331091&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Apr 27 15:15:33 2018
@@ -20947,50 +20947,50 @@ SDValue X86TargetLowering::LowerINTRINSI
     switch (IntNo) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
     case Intrinsic::x86_sse42_pcmpistria128:
-      Opcode = X86ISD::PCMPISTRI;
+      Opcode = X86ISD::PCMPISTR;
       X86CC = X86::COND_A;
       break;
     case Intrinsic::x86_sse42_pcmpestria128:
-      Opcode = X86ISD::PCMPESTRI;
+      Opcode = X86ISD::PCMPESTR;
       X86CC = X86::COND_A;
       break;
     case Intrinsic::x86_sse42_pcmpistric128:
-      Opcode = X86ISD::PCMPISTRI;
+      Opcode = X86ISD::PCMPISTR;
       X86CC = X86::COND_B;
       break;
     case Intrinsic::x86_sse42_pcmpestric128:
-      Opcode = X86ISD::PCMPESTRI;
+      Opcode = X86ISD::PCMPESTR;
       X86CC = X86::COND_B;
       break;
     case Intrinsic::x86_sse42_pcmpistrio128:
-      Opcode = X86ISD::PCMPISTRI;
+      Opcode = X86ISD::PCMPISTR;
       X86CC = X86::COND_O;
       break;
     case Intrinsic::x86_sse42_pcmpestrio128:
-      Opcode = X86ISD::PCMPESTRI;
+      Opcode = X86ISD::PCMPESTR;
       X86CC = X86::COND_O;
       break;
     case Intrinsic::x86_sse42_pcmpistris128:
-      Opcode = X86ISD::PCMPISTRI;
+      Opcode = X86ISD::PCMPISTR;
       X86CC = X86::COND_S;
       break;
     case Intrinsic::x86_sse42_pcmpestris128:
-      Opcode = X86ISD::PCMPESTRI;
+      Opcode = X86ISD::PCMPESTR;
       X86CC = X86::COND_S;
       break;
     case Intrinsic::x86_sse42_pcmpistriz128:
-      Opcode = X86ISD::PCMPISTRI;
+      Opcode = X86ISD::PCMPISTR;
       X86CC = X86::COND_E;
       break;
     case Intrinsic::x86_sse42_pcmpestriz128:
-      Opcode = X86ISD::PCMPESTRI;
+      Opcode = X86ISD::PCMPESTR;
       X86CC = X86::COND_E;
       break;
     }
     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
-    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
-    SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
+    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
+    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
+    SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
 
@@ -20998,15 +20998,28 @@ SDValue X86TargetLowering::LowerINTRINSI
   case Intrinsic::x86_sse42_pcmpestri128: {
     unsigned Opcode;
     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
-      Opcode = X86ISD::PCMPISTRI;
+      Opcode = X86ISD::PCMPISTR;
     else
-      Opcode = X86ISD::PCMPESTRI;
+      Opcode = X86ISD::PCMPESTR;
 
     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
-    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
     return DAG.getNode(Opcode, dl, VTs, NewOps);
   }
 
+  case Intrinsic::x86_sse42_pcmpistrm128:
+  case Intrinsic::x86_sse42_pcmpestrm128: {
+    unsigned Opcode;
+    if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
+      Opcode = X86ISD::PCMPISTR;
+    else
+      Opcode = X86ISD::PCMPESTR;
+
+    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
+    return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
+  }
+
   case Intrinsic::eh_sjlj_lsda: {
     MachineFunction &MF = DAG.getMachineFunction();
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -25794,8 +25807,8 @@ const char *X86TargetLowering::getTarget
   case X86ISD::VGETMANT_RND:       return "X86ISD::VGETMANT_RND";
   case X86ISD::VGETMANTS:          return "X86ISD::VGETMANTS";
   case X86ISD::VGETMANTS_RND:      return "X86ISD::VGETMANTS_RND";
-  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
-  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
+  case X86ISD::PCMPESTR:           return "X86ISD::PCMPESTR";
+  case X86ISD::PCMPISTR:           return "X86ISD::PCMPISTR";
   case X86ISD::XTEST:              return "X86ISD::XTEST";
   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
@@ -26179,79 +26192,6 @@ static MachineBasicBlock *emitXBegin(Mac
   return sinkMBB;
 }
 
-// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
-// or XMM0_V32I8 in AVX all of this code can be replaced with that
-// in the .td file.
-static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
-                                       const TargetInstrInfo *TII) {
-  unsigned Opc;
-  switch (MI.getOpcode()) {
-  default: llvm_unreachable("illegal opcode!");
-  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
-  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
-  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
-  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
-  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
-  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
-  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
-  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
-  }
-
-  DebugLoc dl = MI.getDebugLoc();
-  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
-
-  unsigned NumArgs = MI.getNumOperands();
-  for (unsigned i = 1; i < NumArgs; ++i) {
-    MachineOperand &Op = MI.getOperand(i);
-    if (!(Op.isReg() && Op.isImplicit()))
-      MIB.add(Op);
-  }
-  if (MI.hasOneMemOperand())
-    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-
-  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
-      .addReg(X86::XMM0);
-
-  MI.eraseFromParent();
-  return BB;
-}
-
-// FIXME: Custom handling because TableGen doesn't support multiple implicit
-// defs in an instruction pattern
-static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
-                                       const TargetInstrInfo *TII) {
-  unsigned Opc;
-  switch (MI.getOpcode()) {
-  default: llvm_unreachable("illegal opcode!");
-  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
-  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
-  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
-  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
-  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
-  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
-  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
-  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
-  }
-
-  DebugLoc dl = MI.getDebugLoc();
-  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
-
-  unsigned NumArgs = MI.getNumOperands(); // remove the results
-  for (unsigned i = 1; i < NumArgs; ++i) {
-    MachineOperand &Op = MI.getOperand(i);
-    if (!(Op.isReg() && Op.isImplicit()))
-      MIB.add(Op);
-  }
-  if (MI.hasOneMemOperand())
-    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-
-  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
-      .addReg(X86::ECX);
-
-  MI.eraseFromParent();
-  return BB;
-}
-
 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
                                      const X86Subtarget &Subtarget) {
   DebugLoc dl = MI.getDebugLoc();
@@ -28167,32 +28107,6 @@ X86TargetLowering::EmitInstrWithCustomIn
     MI.eraseFromParent(); // The pseudo instruction is gone now.
     return BB;
   }
-    // String/text processing lowering.
-  case X86::PCMPISTRM128REG:
-  case X86::VPCMPISTRM128REG:
-  case X86::PCMPISTRM128MEM:
-  case X86::VPCMPISTRM128MEM:
-  case X86::PCMPESTRM128REG:
-  case X86::VPCMPESTRM128REG:
-  case X86::PCMPESTRM128MEM:
-  case X86::VPCMPESTRM128MEM:
-    assert(Subtarget.hasSSE42() &&
-           "Target must have SSE4.2 or AVX features enabled");
-    return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
-
-  // String/text processing lowering.
-  case X86::PCMPISTRIREG:
-  case X86::VPCMPISTRIREG:
-  case X86::PCMPISTRIMEM:
-  case X86::VPCMPISTRIMEM:
-  case X86::PCMPESTRIREG:
-  case X86::VPCMPESTRIREG:
-  case X86::PCMPESTRIMEM:
-  case X86::VPCMPESTRIMEM:
-    assert(Subtarget.hasSSE42() &&
-           "Target must have SSE4.2 or AVX features enabled");
-    return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
-
   // Thread synchronization.
   case X86::MONITOR:
     return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=331091&r1=331090&r2=331091&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Fri Apr 27 15:15:33 2018
@@ -576,8 +576,13 @@ namespace llvm {
       RDSEED,
 
       // SSE42 string comparisons.
-      PCMPISTRI,
-      PCMPESTRI,
+      // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
+      // will emit one or two instructions based on which results are used. If
+      // flags and index/mask are used, this lets us use a single instruction
+      // since we won't have to pick an opcode for flags. Instead we can rely on the
+      // DAG to CSE everything and decide at isel.
+      PCMPISTR,
+      PCMPESTR,
 
       // Test if in transactional execution.
       XTEST,

Modified: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td?rev=331091&r1=331090&r2=331091&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td Fri Apr 27 15:15:33 2018
@@ -555,17 +555,6 @@ def X86RndScalesRnd : SDNode<"X86ISD::VR
 def X86ReducesRnd   : SDNode<"X86ISD::VREDUCES_RND",   SDTFPBinOpImmRound>;
 def X86GetMantsRnd  : SDNode<"X86ISD::VGETMANTS_RND",  SDTFPBinOpImmRound>;
 
-def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
-                                         SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
-                                         SDTCisVT<4, i8>]>;
-def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
-                                         SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
-                                         SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
-                                         SDTCisVT<6, i8>]>;
-
-def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
-def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
-
 def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1,
                               [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
 def X86expand  : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=331091&r1=331090&r2=331091&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Fri Apr 27 15:15:33 2018
@@ -632,9 +632,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
     { X86::PABSDrr,         X86::PABSDrm,             TB_ALIGN_16 },
     { X86::PABSWrr,         X86::PABSWrm,             TB_ALIGN_16 },
     { X86::PCMPESTRIrr,     X86::PCMPESTRIrm,         0 },
-    { X86::PCMPESTRM128rr,  X86::PCMPESTRM128rm,      0 },
+    { X86::PCMPESTRMrr,     X86::PCMPESTRMrm,         0 },
     { X86::PCMPISTRIrr,     X86::PCMPISTRIrm,         0 },
-    { X86::PCMPISTRM128rr,  X86::PCMPISTRM128rm,      0 },
+    { X86::PCMPISTRMrr,     X86::PCMPISTRMrm,         0 },
     { X86::PHMINPOSUWrr,    X86::PHMINPOSUWrm,        TB_ALIGN_16 },
     { X86::PMOVSXBDrr,      X86::PMOVSXBDrm,          TB_NO_REVERSE },
     { X86::PMOVSXBQrr,      X86::PMOVSXBQrm,          TB_NO_REVERSE },
@@ -736,10 +736,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
     { X86::VPABSDrr,        X86::VPABSDrm,            0 },
     { X86::VPABSWrr,        X86::VPABSWrm,            0 },
     { X86::VPCMPESTRIrr,    X86::VPCMPESTRIrm,        0 },
-    { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm,     0 },
+    { X86::VPCMPESTRMrr,    X86::VPCMPESTRMrm,        0 },
     { X86::VPCMPISTRIrr,    X86::VPCMPISTRIrm,        0 },
-    { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm,     0 },
-    { X86::VPHMINPOSUWrr,   X86::VPHMINPOSUWrm,      0 },
+    { X86::VPCMPISTRMrr,    X86::VPCMPISTRMrm,        0 },
+    { X86::VPHMINPOSUWrr,   X86::VPHMINPOSUWrm,       0 },
     { X86::VPERMILPDri,     X86::VPERMILPDmi,         0 },
     { X86::VPERMILPSri,     X86::VPERMILPSmi,         0 },
     { X86::VPMOVSXBDrr,     X86::VPMOVSXBDrm,         TB_NO_REVERSE },

Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=331091&r1=331090&r2=331091&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Fri Apr 27 15:15:33 2018
@@ -6383,25 +6383,6 @@ let Constraints = "$src1 = $dst" in
 // SSE4.2 - String/text Processing Instructions
 //===----------------------------------------------------------------------===//
 
-// Packed Compare Implicit Length Strings, Return Mask
-multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
-  def REG : PseudoI<(outs VR128:$dst),
-                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
-    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
-                                                  imm:$src3))]>;
-  def MEM : PseudoI<(outs VR128:$dst),
-                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
-    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
-                       (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
-}
-
-let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
-  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
-                         Requires<[HasAVX]>, VEX_WIG;
-  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", loadv2i64>,
-                         Requires<[UseSSE42]>;
-}
-
 multiclass pcmpistrm_SS42AI<string asm> {
   def rr : SS42AI<0x62, MRMSrcReg, (outs),
     (ins VR128:$src1, VR128:$src2, u8imm:$src3),
@@ -6416,27 +6397,8 @@ multiclass pcmpistrm_SS42AI<string asm>
 
 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
   let Predicates = [HasAVX] in
-  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
-  defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm"> ;
-}
-
-// Packed Compare Explicit Length Strings, Return Mask
-multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
-  def REG : PseudoI<(outs VR128:$dst),
-                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
-    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
-                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
-  def MEM : PseudoI<(outs VR128:$dst),
-                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
-    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
-                       (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
-}
-
-let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
-  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
-                         Requires<[HasAVX]>;
-  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", loadv2i64>,
-                         Requires<[UseSSE42]>;
+  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+  defm PCMPISTRM  : pcmpistrm_SS42AI<"pcmpistrm"> ;
 }
 
 multiclass SS42AI_pcmpestrm<string asm> {
@@ -6453,27 +6415,8 @@ multiclass SS42AI_pcmpestrm<string asm>
 
 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
   let Predicates = [HasAVX] in
-  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
-  defm PCMPESTRM128 :  SS42AI_pcmpestrm<"pcmpestrm">;
-}
-
-// Packed Compare Implicit Length Strings, Return Index
-multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
-  def REG : PseudoI<(outs GR32:$dst),
-                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
-    [(set GR32:$dst, EFLAGS,
-      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
-  def MEM : PseudoI<(outs GR32:$dst),
-                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
-    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
-                              (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
-}
-
-let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
-  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
-                      Requires<[HasAVX]>, VEX_WIG;
-  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI", loadv2i64>,
-                      Requires<[UseSSE42]>;
+  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+  defm PCMPESTRM :  SS42AI_pcmpestrm<"pcmpestrm">;
 }
 
 multiclass SS42AI_pcmpistri<string asm> {
@@ -6494,26 +6437,6 @@ let Defs = [ECX, EFLAGS], hasSideEffects
   defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
 }
 
-// Packed Compare Explicit Length Strings, Return Index
-multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
-  def REG : PseudoI<(outs GR32:$dst),
-                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
-    [(set GR32:$dst, EFLAGS,
-      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
-  def MEM : PseudoI<(outs GR32:$dst),
-                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
-    [(set GR32:$dst, EFLAGS,
-      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
-       imm:$src5))]>;
-}
-
-let Defs = [EFLAGS], Uses = [EAX, EDX], hasNoSchedulingInfo = 1, usesCustomInserter = 1 in {
-  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
-                      Requires<[HasAVX]>;
-  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI", loadv2i64>,
-                      Requires<[UseSSE42]>;
-}
-
 multiclass SS42AI_pcmpestri<string asm> {
   def rr : SS42AI<0x61, MRMSrcReg, (outs),
     (ins VR128:$src1, VR128:$src3, u8imm:$src5),

Removed: llvm/trunk/test/CodeGen/X86/sse42.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse42.ll?rev=331090&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse42.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse42.ll (removed)
@@ -1,964 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X64
-
-declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
-declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
-declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
-declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
-
-define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_reg_eq_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X32-NEXT:    setae %al
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_reg_eq_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    movl %esi, %edx
-; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X64-NEXT:    setae %al
-; X64-NEXT:    retq
-entry:
-  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
-  %result = icmp eq i32 %c, 0
-  ret i1 %result
-}
-
-define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_reg_idx_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_reg_idx_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    movl %esi, %edx
-; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    retq
-entry:
-  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
-  ret i32 %idx
-}
-
-define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_reg_diff_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-16, %esp
-; X32-NEXT:    subl $48, %esp
-; X32-NEXT:    movl 8(%ebp), %eax
-; X32-NEXT:    movl 12(%ebp), %edx
-; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X32-NEXT:    cmpl $16, %ecx
-; X32-NEXT:    jne .LBB2_2
-; X32-NEXT:  # %bb.1:
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    jmp .LBB2_3
-; X32-NEXT:  .LBB2_2: # %compare
-; X32-NEXT:    movdqa %xmm0, (%esp)
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    movb (%esp,%ecx), %al
-; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
-; X32-NEXT:    subb 16(%esp,%ecx), %al
-; X32-NEXT:  .LBB2_3: # %exit
-; X32-NEXT:    movzbl %al, %eax
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_reg_diff_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    movl %esi, %edx
-; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    cmpl $16, %ecx
-; X64-NEXT:    jne .LBB2_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB2_2: # %compare
-; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $15, %ecx
-; X64-NEXT:    movb -24(%rsp,%rcx), %al
-; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    subb -40(%rsp,%rcx), %al
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    retq
-entry:
-  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
-  %eq = icmp eq i32 %idx, 16
-  br i1 %eq, label %exit, label %compare
-
-compare:
-  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
-  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
-  %sub = sub i8 %lhs_c, %rhs_c
-  br label %exit
-
-exit:
-  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
-  %result_ext = zext i8 %result to i32
-  ret i32 %result_ext
-}
-
-define i1 @pcmpestri_mem_eq_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_mem_eq_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movdqu (%esi), %xmm0
-; X32-NEXT:    pcmpestri $24, (%ecx), %xmm0
-; X32-NEXT:    setae %al
-; X32-NEXT:    popl %esi
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_mem_eq_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    movl %ecx, %edx
-; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
-; X64-NEXT:    setae %al
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
-  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
-  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
-  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
-  %result = icmp eq i32 %c, 0
-  ret i1 %result
-}
-
-define i32 @pcmpestri_mem_idx_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_mem_idx_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movdqu (%esi), %xmm0
-; X32-NEXT:    pcmpestri $24, (%ecx), %xmm0
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    popl %esi
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_mem_idx_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    movl %ecx, %edx
-; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
-  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
-  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
-  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
-  ret i32 %idx
-}
-
-define i32 @pcmpestri_mem_diff_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_mem_diff_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    andl $-16, %esp
-; X32-NEXT:    subl $48, %esp
-; X32-NEXT:    movl 12(%ebp), %eax
-; X32-NEXT:    movl 20(%ebp), %edx
-; X32-NEXT:    movl 16(%ebp), %ecx
-; X32-NEXT:    movl 8(%ebp), %esi
-; X32-NEXT:    movdqu (%esi), %xmm1
-; X32-NEXT:    movdqu (%ecx), %xmm0
-; X32-NEXT:    pcmpestri $24, %xmm0, %xmm1
-; X32-NEXT:    cmpl $16, %ecx
-; X32-NEXT:    jne .LBB5_2
-; X32-NEXT:  # %bb.1:
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    jmp .LBB5_3
-; X32-NEXT:  .LBB5_2: # %compare
-; X32-NEXT:    movdqa %xmm1, (%esp)
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    movb (%esp,%ecx), %al
-; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    subb 16(%esp,%ecx), %al
-; X32-NEXT:  .LBB5_3: # %exit
-; X32-NEXT:    movzbl %al, %eax
-; X32-NEXT:    leal -4(%ebp), %esp
-; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_mem_diff_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdqu (%rdi), %xmm1
-; X64-NEXT:    movdqu (%rdx), %xmm0
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    movl %ecx, %edx
-; X64-NEXT:    pcmpestri $24, %xmm0, %xmm1
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    cmpl $16, %ecx
-; X64-NEXT:    jne .LBB5_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB5_2: # %compare
-; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $15, %ecx
-; X64-NEXT:    movb -24(%rsp,%rcx), %al
-; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    subb -40(%rsp,%rcx), %al
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
-  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
-  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
-  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
-  %eq = icmp eq i32 %idx, 16
-  br i1 %eq, label %exit, label %compare
-
-compare:
-  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
-  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
-  %sub = sub i8 %lhs_c, %rhs_c
-  br label %exit
-
-exit:
-  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
-  %result_ext = zext i8 %result to i32
-  ret i32 %result_ext
-}
-
-define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_reg_eq_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X32-NEXT:    setae %al
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_reg_eq_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    movl %esi, %edx
-; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X64-NEXT:    setae %al
-; X64-NEXT:    retq
-entry:
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
-  %result = icmp eq i32 %c, 0
-  ret i1 %result
-}
-
-define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_reg_idx_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_reg_idx_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    movl %esi, %edx
-; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
-  ret i32 %idx
-}
-
-define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_reg_diff_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-16, %esp
-; X32-NEXT:    subl $48, %esp
-; X32-NEXT:    movl 8(%ebp), %eax
-; X32-NEXT:    movl 12(%ebp), %edx
-; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X32-NEXT:    cmpl $16, %ecx
-; X32-NEXT:    jne .LBB8_2
-; X32-NEXT:  # %bb.1:
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    jmp .LBB8_3
-; X32-NEXT:  .LBB8_2: # %compare
-; X32-NEXT:    movdqa %xmm0, (%esp)
-; X32-NEXT:    addl %ecx, %ecx
-; X32-NEXT:    andl $14, %ecx
-; X32-NEXT:    movzwl (%esp,%ecx), %eax
-; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
-; X32-NEXT:    subw 16(%esp,%ecx), %ax
-; X32-NEXT:  .LBB8_3: # %exit
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_reg_diff_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    movl %esi, %edx
-; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    cmpl $16, %ecx
-; X64-NEXT:    jne .LBB8_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB8_2: # %compare
-; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $7, %ecx
-; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
-; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
-  %eq = icmp eq i32 %idx, 16
-  br i1 %eq, label %exit, label %compare
-
-compare:
-  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
-  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
-  %sub = sub i16 %lhs_c, %rhs_c
-  br label %exit
-
-exit:
-  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
-  %result_ext = zext i16 %result to i32
-  ret i32 %result_ext
-}
-
-define i1 @pcmpestri_mem_eq_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_mem_eq_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movdqu (%esi), %xmm0
-; X32-NEXT:    pcmpestri $25, (%ecx), %xmm0
-; X32-NEXT:    setae %al
-; X32-NEXT:    popl %esi
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_mem_eq_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    movl %ecx, %edx
-; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
-; X64-NEXT:    setae %al
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
-  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
-  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
-  %result = icmp eq i32 %c, 0
-  ret i1 %result
-}
-
-define i32 @pcmpestri_mem_idx_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_mem_idx_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movdqu (%esi), %xmm0
-; X32-NEXT:    pcmpestri $25, (%ecx), %xmm0
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    popl %esi
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_mem_idx_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    movl %ecx, %edx
-; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
-  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
-  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
-  ret i32 %idx
-}
-
-define i32 @pcmpestri_mem_diff_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
-; X32-LABEL: pcmpestri_mem_diff_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    andl $-16, %esp
-; X32-NEXT:    subl $48, %esp
-; X32-NEXT:    movl 12(%ebp), %eax
-; X32-NEXT:    movl 20(%ebp), %edx
-; X32-NEXT:    movl 16(%ebp), %ecx
-; X32-NEXT:    movl 8(%ebp), %esi
-; X32-NEXT:    movdqu (%esi), %xmm1
-; X32-NEXT:    movdqu (%ecx), %xmm0
-; X32-NEXT:    pcmpestri $25, %xmm0, %xmm1
-; X32-NEXT:    cmpl $8, %ecx
-; X32-NEXT:    jne .LBB11_2
-; X32-NEXT:  # %bb.1:
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    jmp .LBB11_3
-; X32-NEXT:  .LBB11_2: # %compare
-; X32-NEXT:    movdqa %xmm1, (%esp)
-; X32-NEXT:    addl %ecx, %ecx
-; X32-NEXT:    andl $14, %ecx
-; X32-NEXT:    movzwl (%esp,%ecx), %eax
-; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    subw 16(%esp,%ecx), %ax
-; X32-NEXT:  .LBB11_3: # %exit
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    leal -4(%ebp), %esp
-; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpestri_mem_diff_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdqu (%rdi), %xmm1
-; X64-NEXT:    movdqu (%rdx), %xmm0
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    movl %ecx, %edx
-; X64-NEXT:    pcmpestri $25, %xmm0, %xmm1
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    cmpl $8, %ecx
-; X64-NEXT:    jne .LBB11_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB11_2: # %compare
-; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $7, %ecx
-; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
-; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
-  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
-  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
-  %eq = icmp eq i32 %idx, 8
-  br i1 %eq, label %exit, label %compare
-
-compare:
-  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
-  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
-  %sub = sub i16 %lhs_c, %rhs_c
-  br label %exit
-
-exit:
-  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
-  %result_ext = zext i16 %result to i32
-  ret i32 %result_ext
-}
-
-define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
-; X32-LABEL: pcmpistri_reg_eq_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X32-NEXT:    setae %al
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_reg_eq_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X64-NEXT:    setae %al
-; X64-NEXT:    retq
-entry:
-  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
-  %result = icmp eq i32 %c, 0
-  ret i1 %result
-}
-
-define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
-; X32-LABEL: pcmpistri_reg_idx_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_reg_idx_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    retq
-entry:
-  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
-  ret i32 %idx
-}
-
-define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
-; X32-LABEL: pcmpistri_reg_diff_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X32-NEXT:    cmpl $16, %ecx
-; X32-NEXT:    jne .LBB14_2
-; X32-NEXT:  # %bb.1:
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    movzbl %al, %eax
-; X32-NEXT:    retl
-; X32-NEXT:  .LBB14_2: # %compare
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-16, %esp
-; X32-NEXT:    subl $48, %esp
-; X32-NEXT:    movdqa %xmm0, (%esp)
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    movb (%esp,%ecx), %al
-; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
-; X32-NEXT:    subb 16(%esp,%ecx), %al
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    movzbl %al, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_reg_diff_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    cmpl $16, %ecx
-; X64-NEXT:    jne .LBB14_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB14_2: # %compare
-; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $15, %ecx
-; X64-NEXT:    movb -24(%rsp,%rcx), %al
-; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    subb -40(%rsp,%rcx), %al
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    retq
-entry:
-  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
-  %eq = icmp eq i32 %idx, 16
-  br i1 %eq, label %exit, label %compare
-
-compare:
-  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
-  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
-  %sub = sub i8 %lhs_c, %rhs_c
-  br label %exit
-
-exit:
-  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
-  %result_ext = zext i8 %result to i32
-  ret i32 %result_ext
-}
-
-define i1 @pcmpistri_mem_eq_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
-; X32-LABEL: pcmpistri_mem_eq_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movdqu (%ecx), %xmm0
-; X32-NEXT:    pcmpistri $24, (%eax), %xmm0
-; X32-NEXT:    setae %al
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_mem_eq_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0
-; X64-NEXT:    setae %al
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
-  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
-  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
-  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
-  %result = icmp eq i32 %c, 0
-  ret i1 %result
-}
-
-define i32 @pcmpistri_mem_idx_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
-; X32-LABEL: pcmpistri_mem_idx_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movdqu (%ecx), %xmm0
-; X32-NEXT:    pcmpistri $24, (%eax), %xmm0
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_mem_idx_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
-  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
-  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
-  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
-  ret i32 %idx
-}
-
-define i32 @pcmpistri_mem_diff_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
-; X32-LABEL: pcmpistri_mem_diff_i8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-16, %esp
-; X32-NEXT:    subl $48, %esp
-; X32-NEXT:    movl 12(%ebp), %eax
-; X32-NEXT:    movl 8(%ebp), %ecx
-; X32-NEXT:    movdqu (%ecx), %xmm1
-; X32-NEXT:    movdqu (%eax), %xmm0
-; X32-NEXT:    pcmpistri $24, %xmm0, %xmm1
-; X32-NEXT:    cmpl $16, %ecx
-; X32-NEXT:    jne .LBB17_2
-; X32-NEXT:  # %bb.1:
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    jmp .LBB17_3
-; X32-NEXT:  .LBB17_2: # %compare
-; X32-NEXT:    movdqa %xmm1, (%esp)
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    movb (%esp,%ecx), %al
-; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    subb 16(%esp,%ecx), %al
-; X32-NEXT:  .LBB17_3: # %exit
-; X32-NEXT:    movzbl %al, %eax
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_mem_diff_i8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdqu (%rdi), %xmm1
-; X64-NEXT:    movdqu (%rsi), %xmm0
-; X64-NEXT:    pcmpistri $24, %xmm0, %xmm1
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    cmpl $16, %ecx
-; X64-NEXT:    jne .LBB17_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB17_2: # %compare
-; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $15, %ecx
-; X64-NEXT:    movb -24(%rsp,%rcx), %al
-; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    subb -40(%rsp,%rcx), %al
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
-  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
-  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
-  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
-  %eq = icmp eq i32 %idx, 16
-  br i1 %eq, label %exit, label %compare
-
-compare:
-  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
-  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
-  %sub = sub i8 %lhs_c, %rhs_c
-  br label %exit
-
-exit:
-  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
-  %result_ext = zext i8 %result to i32
-  ret i32 %result_ext
-}
-
-define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
-; X32-LABEL: pcmpistri_reg_eq_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X32-NEXT:    setae %al
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_reg_eq_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X64-NEXT:    setae %al
-; X64-NEXT:    retq
-entry:
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
-  %result = icmp eq i32 %c, 0
-  ret i1 %result
-}
-
-define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
-; X32-LABEL: pcmpistri_reg_idx_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_reg_idx_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
-  ret i32 %idx
-}
-
-define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
-; X32-LABEL: pcmpistri_reg_diff_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X32-NEXT:    cmpl $16, %ecx
-; X32-NEXT:    jne .LBB20_2
-; X32-NEXT:  # %bb.1:
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    retl
-; X32-NEXT:  .LBB20_2: # %compare
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-16, %esp
-; X32-NEXT:    subl $48, %esp
-; X32-NEXT:    movdqa %xmm0, (%esp)
-; X32-NEXT:    addl %ecx, %ecx
-; X32-NEXT:    andl $14, %ecx
-; X32-NEXT:    movzwl (%esp,%ecx), %eax
-; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
-; X32-NEXT:    subw 16(%esp,%ecx), %ax
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_reg_diff_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    cmpl $16, %ecx
-; X64-NEXT:    jne .LBB20_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB20_2: # %compare
-; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $7, %ecx
-; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
-; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
-  %eq = icmp eq i32 %idx, 16
-  br i1 %eq, label %exit, label %compare
-
-compare:
-  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
-  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
-  %sub = sub i16 %lhs_c, %rhs_c
-  br label %exit
-
-exit:
-  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
-  %result_ext = zext i16 %result to i32
-  ret i32 %result_ext
-}
-
-define i1 @pcmpistri_mem_eq_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
-; X32-LABEL: pcmpistri_mem_eq_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movdqu (%ecx), %xmm0
-; X32-NEXT:    pcmpistri $25, (%eax), %xmm0
-; X32-NEXT:    setae %al
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_mem_eq_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0
-; X64-NEXT:    setae %al
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
-  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
-  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
-  %result = icmp eq i32 %c, 0
-  ret i1 %result
-}
-
-define i32 @pcmpistri_mem_idx_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
-; X32-LABEL: pcmpistri_mem_idx_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movdqu (%ecx), %xmm0
-; X32-NEXT:    pcmpistri $25, (%eax), %xmm0
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_mem_idx_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
-  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
-  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
-  ret i32 %idx
-}
-
-define i32 @pcmpistri_mem_diff_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
-; X32-LABEL: pcmpistri_mem_diff_i16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-16, %esp
-; X32-NEXT:    subl $48, %esp
-; X32-NEXT:    movl 12(%ebp), %eax
-; X32-NEXT:    movl 8(%ebp), %ecx
-; X32-NEXT:    movdqu (%ecx), %xmm1
-; X32-NEXT:    movdqu (%eax), %xmm0
-; X32-NEXT:    pcmpistri $25, %xmm0, %xmm1
-; X32-NEXT:    cmpl $8, %ecx
-; X32-NEXT:    jne .LBB23_2
-; X32-NEXT:  # %bb.1:
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    jmp .LBB23_3
-; X32-NEXT:  .LBB23_2: # %compare
-; X32-NEXT:    movdqa %xmm1, (%esp)
-; X32-NEXT:    addl %ecx, %ecx
-; X32-NEXT:    andl $14, %ecx
-; X32-NEXT:    movzwl (%esp,%ecx), %eax
-; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    subw 16(%esp,%ecx), %ax
-; X32-NEXT:  .LBB23_3: # %exit
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    retl
-;
-; X64-LABEL: pcmpistri_mem_diff_i16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdqu (%rdi), %xmm1
-; X64-NEXT:    movdqu (%rsi), %xmm0
-; X64-NEXT:    pcmpistri $25, %xmm0, %xmm1
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    cmpl $8, %ecx
-; X64-NEXT:    jne .LBB23_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB23_2: # %compare
-; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $7, %ecx
-; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
-; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    retq
-entry:
-  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
-  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
-  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
-  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
-  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
-  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
-  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
-  %eq = icmp eq i32 %idx, 8
-  br i1 %eq, label %exit, label %compare
-
-compare:
-  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
-  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
-  %sub = sub i16 %lhs_c, %rhs_c
-  br label %exit
-
-exit:
-  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
-  %result_ext = zext i16 %result to i32
-  ret i32 %result_ext
-}

Copied: llvm/trunk/test/CodeGen/X86/sttni.ll (from r331090, llvm/trunk/test/CodeGen/X86/sse42.ll)
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sttni.ll?p2=llvm/trunk/test/CodeGen/X86/sttni.ll&p1=llvm/trunk/test/CodeGen/X86/sse42.ll&r1=331090&r2=331091&rev=331091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse42.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sttni.ll Fri Apr 27 15:15:33 2018
@@ -4,8 +4,10 @@
 
 declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
 declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
+declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
 declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
 declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
+declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8)
 
 define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
 ; X32-LABEL: pcmpestri_reg_eq_i8:
@@ -962,3 +964,374 @@ exit:
   %result_ext = zext i16 %result to i32
   ret i32 %result_ext
 }
+
+define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i32* %iptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpestr_index_flag:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ebx, %ebx
+; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %ecx, (%edi)
+; X32-NEXT:    movl %ebx, (%esi)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestr_index_flag:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    xorl %r10d, %r10d
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT:    setb %r10b
+; X64-NEXT:    movl %ecx, (%r9)
+; X64-NEXT:    movl %r10d, (%r8)
+; X64-NEXT:    retq
+entry:
+  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  store i32 %index, i32* %iptr
+  store i32 %flag, i32* %fptr
+  ret void
+}
+
+define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpestr_mask_flag:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ebx, %ebx
+; X32-NEXT:    pcmpestrm $24, %xmm1, %xmm0
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movdqa %xmm0, (%esi)
+; X32-NEXT:    movl %ebx, (%ecx)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestr_mask_flag:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    xorl %r9d, %r9d
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
+; X64-NEXT:    setb %r9b
+; X64-NEXT:    movdqa %xmm0, (%r8)
+; X64-NEXT:    movl %r9d, (%rcx)
+; X64-NEXT:    retq
+entry:
+  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  store <16 x i8> %mask, <16 x i8>* %mptr
+  store i32 %flag, i32* %fptr
+  ret void
+}
+
+define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr) nounwind {
+; X32-LABEL: pcmpestr_mask_index:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movdqa %xmm0, %xmm2
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    pcmpestrm $24, %xmm1, %xmm0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    pcmpestri $24, %xmm1, %xmm2
+; X32-NEXT:    movdqa %xmm0, (%edi)
+; X32-NEXT:    movl %ecx, (%esi)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestr_mask_index:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    movdqa %xmm0, %xmm2
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
+; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2
+; X64-NEXT:    movdqa %xmm0, (%r9)
+; X64-NEXT:    movl %ecx, (%r8)
+; X64-NEXT:    retq
+entry:
+  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  store <16 x i8> %mask, <16 x i8>* %mptr
+  store i32 %index, i32* %iptr
+  ret void
+}
+
+define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpestr_mask_index_flag:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movdqa %xmm0, %xmm2
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    pcmpestrm $24, %xmm1, %xmm0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    xorl %ebx, %ebx
+; X32-NEXT:    pcmpestri $24, %xmm1, %xmm2
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movdqa %xmm0, (%ebp)
+; X32-NEXT:    movl %ecx, (%edi)
+; X32-NEXT:    movl %ebx, (%esi)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestr_mask_index_flag:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rcx, %r9
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movdqa %xmm0, %xmm2
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
+; X64-NEXT:    xorl %esi, %esi
+; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2
+; X64-NEXT:    setb %sil
+; X64-NEXT:    movdqa %xmm0, (%r10)
+; X64-NEXT:    movl %ecx, (%r9)
+; X64-NEXT:    movl %esi, (%r8)
+; X64-NEXT:    retq
+entry:
+  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  store <16 x i8> %mask, <16 x i8>* %mptr
+  store i32 %index, i32* %iptr
+  store i32 %flag, i32* %fptr
+  ret void
+}
+
+define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, i32* %iptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpistr_index_flag:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    setb %dl
+; X32-NEXT:    movl %ecx, (%esi)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistr_index_flag:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT:    setb %al
+; X64-NEXT:    movl %ecx, (%rdi)
+; X64-NEXT:    movl %eax, (%rsi)
+; X64-NEXT:    retq
+entry:
+  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  store i32 %index, i32* %iptr
+  store i32 %flag, i32* %fptr
+  ret void
+}
+
+define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpistr_mask_flag:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %ecx, %ecx
+; X32-NEXT:    pcmpistrm $24, %xmm1, %xmm0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    setb %cl
+; X32-NEXT:    movdqa %xmm0, (%edx)
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistr_mask_flag:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
+; X64-NEXT:    setb %al
+; X64-NEXT:    movdqa %xmm0, (%rdi)
+; X64-NEXT:    movl %eax, (%rsi)
+; X64-NEXT:    retq
+entry:
+  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  store <16 x i8> %mask, <16 x i8>* %mptr
+  store i32 %flag, i32* %fptr
+  ret void
+}
+
+define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr) nounwind {
+; X32-LABEL: pcmpistr_mask_index:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT:    pcmpistrm $24, %xmm1, %xmm0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movdqa %xmm0, (%edx)
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistr_mask_index:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
+; X64-NEXT:    movdqa %xmm0, (%rdi)
+; X64-NEXT:    movl %ecx, (%rsi)
+; X64-NEXT:    retq
+entry:
+  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  store <16 x i8> %mask, <16 x i8>* %mptr
+  store i32 %index, i32* %iptr
+  ret void
+}
+
+define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpistr_mask_index_flag:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movdqa %xmm0, %xmm2
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    pcmpistrm $24, %xmm1, %xmm0
+; X32-NEXT:    xorl %ebx, %ebx
+; X32-NEXT:    pcmpistri $24, %xmm1, %xmm2
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movdqa %xmm0, (%esi)
+; X32-NEXT:    movl %ecx, (%edx)
+; X32-NEXT:    movl %ebx, (%eax)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistr_mask_index_flag:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm2
+; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    pcmpistri $24, %xmm1, %xmm2
+; X64-NEXT:    setb %al
+; X64-NEXT:    movdqa %xmm0, (%rdi)
+; X64-NEXT:    movl %ecx, (%rsi)
+; X64-NEXT:    movl %eax, (%rdx)
+; X64-NEXT:    retq
+entry:
+  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  store <16 x i8> %mask, <16 x i8>* %mptr
+  store i32 %index, i32* %iptr
+  store i32 %flag, i32* %fptr
+  ret void
+}
+
+; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri.
+define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, <16 x i8>* %rhsptr, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpistr_mask_index_flag_load:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movdqa %xmm0, %xmm1
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movdqu (%ecx), %xmm2
+; X32-NEXT:    pcmpistrm $24, %xmm2, %xmm0
+; X32-NEXT:    xorl %ebx, %ebx
+; X32-NEXT:    pcmpistri $24, %xmm2, %xmm1
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movdqa %xmm0, (%esi)
+; X32-NEXT:    movl %ecx, (%edx)
+; X32-NEXT:    movl %ebx, (%eax)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistr_mask_index_flag_load:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    movdqu (%rdi), %xmm2
+; X64-NEXT:    pcmpistrm $24, %xmm2, %xmm0
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    pcmpistri $24, %xmm2, %xmm1
+; X64-NEXT:    setb %dil
+; X64-NEXT:    movdqa %xmm0, (%rsi)
+; X64-NEXT:    movl %ecx, (%rdx)
+; X64-NEXT:    movl %edi, (%rax)
+; X64-NEXT:    retq
+entry:
+  %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 1
+  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  store <16 x i8> %mask, <16 x i8>* %mptr
+  store i32 %index, i32* %iptr
+  store i32 %flag, i32* %fptr
+  ret void
+}
+
+; Make sure we don't fold nontemporal loads.
+define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, <16 x i8>* %rhsptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_nontemporal:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movntdqa (%ecx), %xmm1
+; X32-NEXT:    xorl %ebx, %ebx
+; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_nontemporal:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movntdqa (%rsi), %xmm1
+; X64-NEXT:    xorl %esi, %esi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT:    setb %sil
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    retq
+entry:
+  %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 16, !nontemporal !0
+  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  ret i32 %flag
+}
+
+!0 = !{ i32 1 }




More information about the llvm-commits mailing list