[llvm-commits] [llvm] r147601 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp lib/Target/X86/X86ISelLowering.h test/CodeGen/X86/avx-brcond.ll test/CodeGen/X86/brcond.ll

Thu Jan 5 00:46:19 PST 2012

Author: vumansky
Date: Thu Jan  5 02:46:19 2012
New Revision: 147601

URL: http://llvm.org/viewvc/llvm-project?rev=147601&view=rev
Log:
Peephole optimization of ptest-conditioned branches on the X86 architecture. Combines the instruction sequences generated for the ptestz/ptestc intrinsics into a ptest+jcc pair, for both SSE and AVX.

Testing: passed 'make check' including LIT tests for all sequences being handled (both SSE and AVX)

Reviewers: Evan Cheng, David Blaikie, Bruno Lopes, Elena Demikhovsky, Chad Rosier, Anton Korobeynikov


Added:
    llvm/trunk/test/CodeGen/X86/avx-brcond.ll   (with props)
Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.h
    llvm/trunk/test/CodeGen/X86/brcond.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=147601&r1=147600&r2=147601&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Jan  5 02:46:19 2012
@@ -14611,6 +14611,146 @@
   return OptimizeConditionalInDecrement(N, DAG);
 }
 
+// Returns the index of the first constant operand of a 2-operand node, or -1.
+static inline int GetConstOpIndexFor2OpNode(SDValue Op) {
+  if (isa<ConstantSDNode>(Op.getOperand(0)))
+    return 0;
+  if (isa<ConstantSDNode>(Op.getOperand(1)))
+    return 1;
+  return -1;
+}
+
+SDValue X86TargetLowering::PerformBrcondCombine(SDNode* N, SelectionDAG &DAG,
+                                                DAGCombinerInfo &DCI) const {
+  // Simplification of the PTEST-and-BRANCH pattern.
+  //
+  // The LLVM IR patterns targeted are:
+  //     %res = call i32 @llvm.x86.<func>(...)
+  //     %one = icmp {ne|eq} i32 %res, {0|1}
+  //     br i1 %one, label %bb1, label %bb2
+  //                  and
+  //     %res = call i32 @llvm.x86.<func>(...)
+  //     %one = trunc i32 %res to i1
+  //     br i1 %one, label %bb1, label %bb2
+  //           where <func> is one of:
+  //             sse41.ptestz
+  //             sse41.ptestc
+  //             avx.ptestz.256
+  //             avx.ptestc.256
+  //
+  // The simplification is in folding of the following SDNode sequence:
+  //    X86ISD::PTEST
+  //    {X86ISD::SETCC | X86ISD::SETCC_CARRY}
+  //    [ISD::ZERO_EXTEND][[[ISD::AND,]ISD::TRUNCATE,]ISD::AND]
+  //    X86ISD::CMP
+  //    X86ISD::BRCOND(cond)
+  // to the code sequence:
+  //    X86ISD::PTEST
+  //    X86ISD::BRCOND(!cond)
+
+  // The optimization is relevant only once the DAG contains x86 ISA (i.e. after
+  // operation legalization).
+  if (DCI.isBeforeLegalize() || DCI.isBeforeLegalizeOps() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  // Below we walk the DAG upwards, starting from the BRCOND node and finishing
+  // at the PTEST node. We bail out as soon as a node does not match any of
+  // the patterns which we are able to simplify.
+
+  // Indices of the constant and variable operands in two-operand nodes.
+  int ConstOpIdx;
+  unsigned int VarOpIdx;
+
+  // Validate that we're starting from the BRCOND node.
+  assert(N->getOpcode() == X86ISD::BRCOND && "Should start from conditional branch!");
+  // Check that the BRCOND condition is COND_NE or COND_E.
+  if (!isa<ConstantSDNode>(N->getOperand(2)))
+    return SDValue();
+  uint64_t BranchCond = N->getConstantOperandVal(2);
+  if (BranchCond != X86::COND_NE && BranchCond != X86::COND_E)
+    return SDValue();
+
+  // 1st step upwards: verify CMP use.
+  SDValue CmpValue = N->getOperand(3);
+  if (CmpValue.getOpcode() != X86ISD::CMP)
+    return SDValue();
+  // Check that the CMP compares against a constant 0 or 1.
+  if ((ConstOpIdx = GetConstOpIndexFor2OpNode(CmpValue)) == -1)
+    return SDValue();
+  VarOpIdx = (ConstOpIdx == 0)? 1:0;
+  uint64_t CompareWith = CmpValue.getConstantOperandVal((unsigned int)ConstOpIdx);
+  if (CompareWith != 0 && CompareWith != 1)
+    return SDValue();
+
+  // 2nd step upwards: cover alternative paths between pre-BRCOND CMP and PTEST
+  // return value analysis.
+  
+  SDValue SVOp = CmpValue.getOperand(VarOpIdx);
+  // Verify optional AND use.
+  if (SVOp.getOpcode() == ISD::AND) {
+    // Check that the AND is with 0x1.
+    if ((ConstOpIdx = GetConstOpIndexFor2OpNode(SVOp)) == -1)
+      return SDValue();
+    VarOpIdx = (ConstOpIdx == 0)? 1:0;
+    if (SVOp.getConstantOperandVal((unsigned int)ConstOpIdx) != 1)
+      return SDValue();
+    // Step upwards: verify optional TRUNCATE use.
+    SVOp = SVOp.getOperand(VarOpIdx);
+    if (SVOp.getOpcode() == ISD::TRUNCATE) {
+      // Step upwards: verify optional AND use (ZERO_EXTEND is handled below).
+      SVOp = SVOp.getOperand(0);
+      if (SVOp.getOpcode() == ISD::AND) {
+        // Check that the AND is with 0x1.
+        if ((ConstOpIdx = GetConstOpIndexFor2OpNode(SVOp)) == -1)
+          return SDValue();
+        VarOpIdx = (ConstOpIdx == 0)? 1:0;
+        if (SVOp.getConstantOperandVal((unsigned int)ConstOpIdx) != 1)
+            return SDValue();
+        // Step upwards.
+        SVOp = SVOp.getOperand(VarOpIdx);
+      }
+    }
+  }
+  // Verify optional ZERO_EXTEND use.
+  if (SVOp.getOpcode() == ISD::ZERO_EXTEND) {
+    // Step upwards.
+    SVOp = SVOp.getOperand(0);
+  }
+
+  // 3rd step upwards: verify SETCC or SETCC_CARRY use.
+  unsigned SetCcOP = SVOp.getOpcode();
+  if (SetCcOP != X86ISD::SETCC && SetCcOP != X86ISD::SETCC_CARRY)
+    return SDValue();
+  // Check that the SETCC/SETCC_CARRY flag is 'COND_E' (for ptestz) or 'COND_B' (for ptestc)
+  if ((ConstOpIdx = GetConstOpIndexFor2OpNode(SVOp)) == -1)
+    return SDValue();
+  VarOpIdx = (ConstOpIdx == 0)? 1:0;
+  uint64_t SetCond = SVOp.getConstantOperandVal((unsigned int)ConstOpIdx);
+  if (SetCond != X86::COND_E && SetCond != X86::COND_B)
+    return SDValue();
+
+  // 4th step upwards: verify PTEST use.
+  SDValue PtestValue = SVOp.getOperand(VarOpIdx);
+  if (PtestValue.getOpcode() != X86ISD::PTEST)
+    return SDValue();
+
+  // The chain to be folded is recognized. We can fold it now.
+
+  // First, select the branch condition (by default the SETCC condition).
+  SDValue CC = DAG.getConstant(SetCond, MVT::i8);
+  if ((CompareWith == 1 && BranchCond == X86::COND_NE) ||
+      (CompareWith == 0 && BranchCond == X86::COND_E)) {
+    // Invert branch condition.
+    CC = (SetCond == X86::COND_E? DAG.getConstant(X86::COND_NE, MVT::i8):
+                                   DAG.getConstant(X86::COND_AE, MVT::i8));
+  }
+  // Then update the BRCOND node: operand 2 becomes CC, operand 3 the PTEST.
+  // ResNo is set to 0 as X86ISD::BRCOND has a single return value.
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1),
+                                        CC, PtestValue), 0);
+
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -14657,6 +14797,7 @@
   case X86ISD::VPERMILP:
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
+  case X86ISD::BRCOND: return PerformBrcondCombine(N, DAG, DCI); 
   }
 
   return SDValue();

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=147601&r1=147600&r2=147601&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Thu Jan  5 02:46:19 2012
@@ -836,6 +836,7 @@
     SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue PerformBrcondCombine(SDNode* N, SelectionDAG &DAG, DAGCombinerInfo &DCI) const;
 
     // Utility functions to help LowerVECTOR_SHUFFLE
     SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const;

Added: llvm/trunk/test/CodeGen/X86/avx-brcond.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-brcond.ll?rev=147601&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-brcond.ll (added)
+++ llvm/trunk/test/CodeGen/X86/avx-brcond.ll Thu Jan  5 02:46:19 2012
@@ -0,0 +1,244 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+declare i32 @llvm.x86.avx.ptestz.256(<4 x i64> %p1, <4 x i64> %p2) nounwind
+declare i32 @llvm.x86.avx.ptestc.256(<4 x i64> %p1, <4 x i64> %p2) nounwind
+
+define <4 x float> @test1(<4 x i64> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test1:
+; CHECK: vptest
+; CHECK-NEXT:	jne
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a, <4 x i64> %a) nounwind 
+  %one = icmp ne i32 %res, 0 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test2(<4 x i64> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test2:
+; CHECK: vptest
+; CHECK-NEXT:	je
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a, <4 x i64> %a) nounwind 
+  %one = icmp eq i32 %res, 0 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test3(<4 x i64> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test3:
+; CHECK: vptest
+; CHECK-NEXT:	jne
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a, <4 x i64> %a) nounwind 
+  %one = trunc i32 %res to i1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test4(<4 x i64> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test4:
+; CHECK: vptest
+; CHECK-NEXT:	jae
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a, <4 x i64> %a) nounwind 
+  %one = icmp ne i32 %res, 0 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test5(<4 x i64> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test5:
+; CHECK: vptest
+; CHECK-NEXT:	jb
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a, <4 x i64> %a) nounwind 
+  %one = icmp eq i32 %res, 0 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test6(<4 x i64> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test6:
+; CHECK: vptest
+; CHECK-NEXT:	jae
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a, <4 x i64> %a) nounwind 
+  %one = trunc i32 %res to i1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test7(<4 x i64> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test7:
+; CHECK: vptest
+; CHECK-NEXT:	jne
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a, <4 x i64> %a) nounwind 
+  %one = icmp eq i32 %res, 1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test8(<4 x i64> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test8:
+; CHECK: vptest
+; CHECK-NEXT:	je
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a, <4 x i64> %a) nounwind 
+  %one = icmp ne i32 %res, 1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test9(<4 x i64> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test9:
+; CHECK: vptest
+; CHECK-NEXT:	jae
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a, <4 x i64> %a) nounwind 
+  %one = icmp eq i32 %res, 1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test10(<4 x i64> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test10:
+; CHECK: vptest
+; CHECK-NEXT:	jb
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a, <4 x i64> %a) nounwind 
+  %one = icmp ne i32 %res, 1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}

Propchange: llvm/trunk/test/CodeGen/X86/avx-brcond.ll
------------------------------------------------------------------------------
    svn:executable = *

Modified: llvm/trunk/test/CodeGen/X86/brcond.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/brcond.ll?rev=147601&r1=147600&r2=147601&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/brcond.ll (original)
+++ llvm/trunk/test/CodeGen/X86/brcond.ll Thu Jan  5 02:46:19 2012
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=penryn | FileCheck %s
+
 ; rdar://7475489
 
 define i32 @test1(i32 %a, i32 %b) nounwind ssp {
@@ -106,3 +107,246 @@
   %.0 = fptrunc double %.0.in to float            ; <float> [#uses=1]
   ret float %.0
 }
+
+declare i32 @llvm.x86.sse41.ptestz(<4 x float> %p1, <4 x float> %p2) nounwind
+declare i32 @llvm.x86.sse41.ptestc(<4 x float> %p1, <4 x float> %p2) nounwind
+
+define <4 x float> @test5(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test5:
+; CHECK: ptest
+; CHECK-NEXT:	jne
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.sse41.ptestz(<4 x float> %a, <4 x float> %a) nounwind 
+  %one = icmp ne i32 %res, 0 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test6(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test6:
+; CHECK: ptest
+; CHECK-NEXT:	je
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.sse41.ptestz(<4 x float> %a, <4 x float> %a) nounwind 
+  %one = icmp eq i32 %res, 0 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test7(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test7:
+; CHECK: ptest
+; CHECK-NEXT:	jne
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.sse41.ptestz(<4 x float> %a, <4 x float> %a) nounwind 
+  %one = trunc i32 %res to i1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test8(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test8:
+; CHECK: ptest
+; CHECK-NEXT:	jae
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.sse41.ptestc(<4 x float> %a, <4 x float> %a) nounwind 
+  %one = icmp ne i32 %res, 0 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test9(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test9:
+; CHECK: ptest
+; CHECK-NEXT:	jb
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.sse41.ptestc(<4 x float> %a, <4 x float> %a) nounwind 
+  %one = icmp eq i32 %res, 0 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test10(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test10:
+; CHECK: ptest
+; CHECK-NEXT:	jae
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.sse41.ptestc(<4 x float> %a, <4 x float> %a) nounwind 
+  %one = trunc i32 %res to i1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test11(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test11:
+; CHECK: ptest
+; CHECK-NEXT:	jne
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.sse41.ptestz(<4 x float> %a, <4 x float> %a) nounwind 
+  %one = icmp eq i32 %res, 1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test12(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test12:
+; CHECK: ptest
+; CHECK-NEXT:	je
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.sse41.ptestz(<4 x float> %a, <4 x float> %a) nounwind 
+  %one = icmp ne i32 %res, 1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test13(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test13:
+; CHECK: ptest
+; CHECK-NEXT:	jae
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.sse41.ptestc(<4 x float> %a, <4 x float> %a) nounwind 
+  %one = icmp eq i32 %res, 1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}
+
+define <4 x float> @test14(<4 x float> %a, <4 x float> %b) nounwind {
+entry:
+; CHECK: test14:
+; CHECK: ptest
+; CHECK-NEXT:	jb
+; CHECK: ret
+
+  %res = call i32 @llvm.x86.sse41.ptestc(<4 x float> %a, <4 x float> %a) nounwind 
+  %one = icmp ne i32 %res, 1 
+  br i1 %one, label %bb1, label %bb2
+
+bb1:
+  %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+  br label %return
+
+bb2:
+	%d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 >
+	br label %return
+
+return:
+  %e = phi <4 x float> [%c, %bb1], [%d, %bb2]
+  ret <4 x float> %e
+}