[llvm] 93c9b39 - [X86] Fix MOVMSK(CONCAT(X, Y)) -> MOVMSK(AND/OR(X, Y)) fold for float types and demanded elements

Fri Jan 28 03:05:17 PST 2022

Author: Simon Pilgrim
Date: 2022-01-28T11:01:47Z
New Revision: 93c9b39d25ce842a911e800f68432668403b8aca

URL: https://github.com/llvm/llvm-project/commit/93c9b39d25ce842a911e800f68432668403b8aca
DIFF: https://github.com/llvm/llvm-project/commit/93c9b39d25ce842a911e800f68432668403b8aca.diff

LOG: [X86] Fix MOVMSK(CONCAT(X,Y)) -> MOVMSK(AND/OR(X,Y)) fold for float types and demanded elements

rG9103b73fe052 assumed that we could OR/AND the concatenated source vectors directly, but that fails on float/double vectors unless they are first bitcast to the equivalent integer vector type. It also missed the case where an any_of check might be testing fewer than all of the source elements.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/combine-movmsk-avx.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aff72452af6c7..961c39832b627 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44500,14 +44500,16 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
   // MOVMSK(CONCAT(X,Y)) != 0 ->  MOVMSK(OR(X,Y)).
   // MOVMSK(CONCAT(X,Y)) == -1 ->  MOVMSK(AND(X,Y)).
   // MOVMSK(CONCAT(X,Y)) != -1 ->  MOVMSK(AND(X,Y)).
-  if (VecVT.is256BitVector()) {
+  if (VecVT.is256BitVector() && NumElts <= CmpBits) {
     SmallVector<SDValue> Ops;
     if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
         Ops.size() == 2) {
       SDLoc DL(EFLAGS);
-      EVT SubVT = Ops[0].getValueType();
+      EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
       APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
-      SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT, Ops);
+      SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
+                              DAG.getBitcast(SubVT, Ops[0]),
+                              DAG.getBitcast(SubVT, Ops[1]));
       V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
       return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                          DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),

diff  --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
index 17d01e1d3362c..9e393a76a5b38 100644
--- a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
+++ b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
@@ -134,3 +134,36 @@ define i32 @movmskps_sext_v8i32(<8 x i16> %a0)  {
   %3 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %2)
   ret i32 %3
 }
+
+define i32 @movmskps_concat_v4f32(<4 x float> %a0, <4 x float> %a1)  {
+; CHECK-LABEL: movmskps_concat_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovmskps %xmm0, %eax
+; CHECK-NEXT:    negl %eax
+; CHECK-NEXT:    sbbl %eax, %eax
+; CHECK-NEXT:    retq
+  %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %1)
+  %3 = icmp ne i32 %2, 0
+  %4 = sext i1 %3 to i32
+  ret i32 %4
+}
+
+define i32 @movmskps_demanded_concat_v4f32(<4 x float> %a0, <4 x float> %a1)  {
+; CHECK-LABEL: movmskps_demanded_concat_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT:    vmovmskps %ymm0, %eax
+; CHECK-NEXT:    andl $3, %eax
+; CHECK-NEXT:    negl %eax
+; CHECK-NEXT:    sbbl %eax, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %1)
+  %3 = and i32 %2, 3
+  %4 = icmp ne i32 %3, 0
+  %5 = sext i1 %4 to i32
+  ret i32 %5
+}