[llvm] r332743 - [X86] Directly legalize v16i16/v8i16 vselect to vXi8 vselect to use VPBLENDVB

Fri May 18 10:48:06 PDT 2018

Author: ctopper
Date: Fri May 18 10:48:06 2018
New Revision: 332743

URL: http://llvm.org/viewvc/llvm-project?rev=332743&view=rev
Log:
[X86] Directly legalize v16i16/v8i16 vselect to vXi8 vselect to use VPBLENDVB

The intrinsic legalization for masked truncate uses ISD::TRUNCATE which can be constant folded by getNode. This prevents getVectorMaskingNode from seeing the ISD::TRUNCATE special case where it should emit X86ISD::SELECT instead of ISD::VSELECT. This causes a vselect with a v16i1 or v8i1 condition to be emitted during vector legalization. but vector legalization doesn't revisit nodes it creates. DAG combine will then promote this condition to match the result type. Then op legalization will try to legalize it, but the custom lowering hook returned SDValue(). But op legalization doesn't have an Expand for VSELECT because it expects vector legalization to have taken care of it. So the operation sticks around and fails in isel.

This patch adds a custom legalization hook to morph it to a vXi8 vselect instead.

This also simplifies the normal vXi16 vselect handling because vector legalization was normally expanding to AND/ANDN/OR and DAG combine was turning that into VBLENDVB. So we can skip a step by doing it directly.

Fixes PR37499

Differential Revision: https://reviews.llvm.org/D47025

Added:
    llvm/trunk/test/CodeGen/X86/pr37499.ll
Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=332743&r1=332742&r2=332743&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri May 18 10:48:06 2018
@@ -15032,10 +15032,15 @@ SDValue X86TargetLowering::LowerVSELECT(
     return SDValue();
 
   case MVT::v8i16:
-  case MVT::v16i16:
-    // FIXME: We should custom lower this by fixing the condition and using i8
-    // blends.
-    return SDValue();
+  case MVT::v16i16: {
+    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
+    MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
+    SDValue Cond = DAG.getBitcast(CastVT, Op->getOperand(0));
+    SDValue LHS = DAG.getBitcast(CastVT, Op->getOperand(1));
+    SDValue RHS = DAG.getBitcast(CastVT, Op->getOperand(2));
+    SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
+    return DAG.getBitcast(VT, Select);
+  }
   }
 }
 

Added: llvm/trunk/test/CodeGen/X86/pr37499.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr37499.ll?rev=332743&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr37499.ll (added)
+++ llvm/trunk/test/CodeGen/X86/pr37499.ll Fri May 18 10:48:06 2018
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s
+
+define <2 x i64> @foo() {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT:    vmovdqa32 %ymm1, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
+; CHECK-NEXT:    vpblendvb %xmm1, %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> undef, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i8 1) #3
+  %2 = bitcast <8 x i16> %1 to <2 x i64>
+  ret <2 x i64> %2
+}
+
+define <4 x i64> @goo() {
+; CHECK-LABEL: goo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT:    movw $1, %ax
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vpmovdw %zmm1, %ymm1
+; CHECK-NEXT:    vpblendvb %ymm1, %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = tail call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> undef, <16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i16 1) #3
+  %2 = bitcast <16 x i16> %1 to <4 x i64>
+  ret <4 x i64> %2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
+declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)