[llvm] r342806 - [InstCombine][x86] try even harder to convert blendv intrinsic to generic IR (PR38814)

Sat Sep 22 07:43:55 PDT 2018

Author: spatel
Date: Sat Sep 22 07:43:55 2018
New Revision: 342806

URL: http://llvm.org/viewvc/llvm-project?rev=342806&view=rev
Log:
[InstCombine][x86] try even harder to convert blendv intrinsic to generic IR (PR38814)

Follow-up to rL342324 (D52059):

Missing optimizations with blendv are shown in:
https://bugs.llvm.org/show_bug.cgi?id=38814

This is an easier and more powerful solution than adding pattern matching for a few
special cases in the backend. The potential danger with this transform in IR is that
the condition value can get separated from the select, and the backend might then not
be able to form a blendv from it again.

Modified:
    llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
    llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll

Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=342806&r1=342805&r2=342806&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Sat Sep 22 07:43:55 2018
@@ -2949,14 +2949,27 @@ Instruction *InstCombiner::visitCallInst
     // Convert to a vector select if we can bypass casts and find a boolean
     // vector condition value.
     Value *BoolVec;
-    if (match(peekThroughBitcast(Mask), m_SExt(m_Value(BoolVec)))) {
-      auto *VTy = dyn_cast<VectorType>(BoolVec->getType());
-      if (VTy && VTy->getScalarSizeInBits() == 1 &&
-          VTy->getVectorNumElements() == II->getType()->getVectorNumElements())
+    Mask = peekThroughBitcast(Mask);
+    if (match(Mask, m_SExt(m_Value(BoolVec))) &&
+        BoolVec->getType()->isVectorTy() &&
+        BoolVec->getType()->getScalarSizeInBits() == 1) {
+      assert(Mask->getType()->getPrimitiveSizeInBits() ==
+             II->getType()->getPrimitiveSizeInBits() &&
+             "Not expecting mask and operands with different sizes");
+
+      unsigned NumMaskElts = Mask->getType()->getVectorNumElements();
+      unsigned NumOperandElts = II->getType()->getVectorNumElements();
+      if (NumMaskElts == NumOperandElts)
         return SelectInst::Create(BoolVec, Op1, Op0);
-      // TODO: If we can find a boolean vector condition with less elements,
-      //       then we can form a vector select by bitcasting Op0/Op1 to a
-      //       vector type with wider elements and bitcasting the result.
+
+      // If the mask has fewer elements than the operands, each mask bit maps to
+      // multiple elements of the operands. Bitcast back and forth.
+      if (NumMaskElts < NumOperandElts) {
+        Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType());
+        Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType());
+        Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
+        return new BitCastInst(Sel, II->getType());
+      }
     }
 
     break;

Modified: llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll?rev=342806&r1=342805&r2=342806&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll Sat Sep 22 07:43:55 2018
@@ -177,13 +177,14 @@ define <2 x double> @sel_v2f64(<2 x doub
   ret <2 x double> %r
 }
 
-; TODO: We can bitcast X, Y, and the select and remove the intrinsic.
+; Bitcast X, Y, and the select and remove the intrinsic.
 
 define <16 x i8> @sel_v4i32(<16 x i8> %x, <16 x i8> %y, <4 x i1> %cond) {
 ; CHECK-LABEL: @sel_v4i32(
-; CHECK-NEXT:    [[S:%.*]] = sext <4 x i1> [[COND:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32> [[S]] to <16 x i8>
-; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[X:%.*]], <16 x i8> [[Y:%.*]], <16 x i8> [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[Y:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[TMP2]], <4 x i32> [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
 ;
   %s = sext <4 x i1> %cond to <4 x i32>
@@ -238,19 +239,17 @@ define <2 x double> @sel_v2f64_sse_reali
   ret <2 x double> %r
 }
 
-; TODO: We can bitcast the inputs to the select and the result and remove the intrinsic.
+; Bitcast the inputs and the result and remove the intrinsic.
 
 define <2 x i64> @sel_v4i32_sse_reality(<2 x i64>* nocapture readonly %x, <2 x i64> %y, <2 x i64> %z) {
 ; CHECK-LABEL: @sel_v4i32_sse_reality(
-; CHECK-NEXT:    [[XCAST:%.*]] = bitcast <2 x i64>* [[X:%.*]] to <16 x i8>*
-; CHECK-NEXT:    [[LD:%.*]] = load <16 x i8>, <16 x i8>* [[XCAST]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[X:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[LD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
 ; CHECK-NEXT:    [[YCAST:%.*]] = bitcast <2 x i64> [[Y:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[ZCAST:%.*]] = bitcast <2 x i64> [[Z:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <4 x i32> [[YCAST]], [[ZCAST]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT:    [[COND:%.*]] = bitcast <4 x i32> [[SEXT]] to <16 x i8>
-; CHECK-NEXT:    [[R:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[LD]], <16 x i8> zeroinitializer, <16 x i8> [[COND]])
-; CHECK-NEXT:    [[RCAST:%.*]] = bitcast <16 x i8> [[R]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[CMP]], <4 x i32> zeroinitializer, <4 x i32> [[LD1]]
+; CHECK-NEXT:    [[RCAST:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[RCAST]]
 ;
   %xcast = bitcast <2 x i64>* %x to <16 x i8>*