[llvm] r368506 - [X86] Match the IR pattern form movmsk on SSE1 only targets where v4i32 isn't legal

Sat Aug 10 00:51:13 PDT 2019

Author: ctopper
Date: Sat Aug 10 00:51:13 2019
New Revision: 368506

URL: http://llvm.org/viewvc/llvm-project?rev=368506&view=rev
Log:
[X86] Match the IR pattern form movmsk on SSE1 only targets where v4i32 isn't legal

Summary:
This patch adds a special DAG combine for SSE1 to recognize the IR pattern InstCombine gives us for movmsk. This only does the recognition for a few cases where its obvious the input won't be scalarized resulting in building a vector just do to the movmsk. I've made it separate from our existing matching for movmsk since that's called in multiple places and I didn't spend time to see if the other callers would make sense here. Plus the restrictions and additional checks would complicate that.

This fixes the case from PR42870. Buts its probably still broken the presence of logic ops feeding the movmsk pattern which would further hide the v4f32 type.

Reviewers: spatel, RKSimon, xbolva00

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65689

Added:
    llvm/trunk/test/CodeGen/X86/pr42870.ll
Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=368506&r1=368505&r2=368506&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat Aug 10 00:51:13 2019
@@ -34820,6 +34820,24 @@ static SDValue combineBitcast(SDNode *N,
     if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
       return V;
 
+    // Recognize the IR pattern for the movmsk intrinsic under SSE1 befoer type
+    // legalization destroys the v4i32 type.
+    if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
+        VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
+        N0.getOperand(0).getValueType() == MVT::v4i32 &&
+        ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
+        cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
+      SDValue N00 = N0.getOperand(0);
+      // Only do this if we can avoid scalarizing the input.
+      if (ISD::isNormalLoad(N00.getNode()) ||
+          (N00.getOpcode() == ISD::BITCAST &&
+           N00.getOperand(0).getValueType() == MVT::v4f32)) {
+        SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
+                                DAG.getBitcast(MVT::v4f32, N00));
+        return DAG.getZExtOrTrunc(V, dl, VT);
+      }
+    }
+
     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
     // type, widen both sides to avoid a trip through memory.
     if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
@@ -41775,7 +41793,8 @@ static SDValue combineSetCC(SDNode *N, S
 }
 
 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
-                             TargetLowering::DAGCombinerInfo &DCI) {
+                             TargetLowering::DAGCombinerInfo &DCI,
+                             const X86Subtarget &Subtarget) {
   SDValue Src = N->getOperand(0);
   MVT SrcVT = Src.getSimpleValueType();
   MVT VT = N->getSimpleValueType(0);
@@ -41796,7 +41815,7 @@ static SDValue combineMOVMSK(SDNode *N,
 
   // Look through int->fp bitcasts that don't change the element width.
   unsigned EltWidth = SrcVT.getScalarSizeInBits();
-  if (Src.getOpcode() == ISD::BITCAST &&
+  if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
       Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
     return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
 
@@ -43759,7 +43778,7 @@ SDValue X86TargetLowering::PerformDAGCom
   case X86ISD::FMSUBADD_RND:
   case X86ISD::FMADDSUB:
   case X86ISD::FMSUBADD:    return combineFMADDSUB(N, DAG, Subtarget);
-  case X86ISD::MOVMSK:      return combineMOVMSK(N, DAG, DCI);
+  case X86ISD::MOVMSK:      return combineMOVMSK(N, DAG, DCI, Subtarget);
   case X86ISD::MGATHER:
   case X86ISD::MSCATTER:
   case ISD::MGATHER:

Added: llvm/trunk/test/CodeGen/X86/pr42870.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr42870.ll?rev=368506&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr42870.ll (added)
+++ llvm/trunk/test/CodeGen/X86/pr42870.ll Sat Aug 10 00:51:13 2019
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=sse | FileCheck %s
+
+define i32 @foo(<4 x float>* %a) {
+; CHECK-LABEL: foo:
+; CHECK:       ## %bb.0: ## %start
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movaps (%eax), %xmm0
+; CHECK-NEXT:    movmskps %xmm0, %eax
+; CHECK-NEXT:    retl
+start:
+  %0 = bitcast <4 x float>* %a to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 16
+  %2 = icmp slt <4 x i32> %1, zeroinitializer
+  %3 = bitcast <4 x i1> %2 to i4
+  %4 = zext i4 %3 to i32
+  ret i32 %4
+}
+
+define i32 @bar(<4 x float> %a) {
+; CHECK-LABEL: bar:
+; CHECK:       ## %bb.0: ## %start
+; CHECK-NEXT:    movmskps %xmm0, %eax
+; CHECK-NEXT:    retl
+start:
+  %0 = bitcast <4 x float> %a to <4 x i32>
+  %1 = icmp slt <4 x i32> %0, zeroinitializer
+  %2 = bitcast <4 x i1> %1 to i4
+  %3 = zext i4 %2 to i32
+  ret i32 %3
+}