[PATCH] D52059: [InstCombine][x86] try harder to convert blendv intrinsic to generic IR (PR38814)

Thu Sep 13 13:43:58 PDT 2018

spatel created this revision.
spatel added reviewers: RKSimon, craig.topper, andreadb.
Herald added a subscriber: mcrosier.

Missing optimizations with blendv are shown in:
https://bugs.llvm.org/show_bug.cgi?id=38814

If this works, it's an easier and more powerful solution than adding pattern matching for a few special cases in the backend. The potential danger with this transform in IR is that the condition value can get separated from the select, and the backend might not be able to make a blendv out of it again. I don't think that's too likely, but I've kept this patch minimal with a 'TODO', so we can test that theory in the wild before expanding the transform.


https://reviews.llvm.org/D52059

Files:
  lib/Transforms/InstCombine/InstCombineCalls.cpp
  test/Transforms/InstCombine/X86/blend_x86.ll


Index: test/Transforms/InstCombine/X86/blend_x86.ll
===================================================================

--- test/Transforms/InstCombine/X86/blend_x86.ll
+++ test/Transforms/InstCombine/X86/blend_x86.ll
@@ -157,9 +157,7 @@
 
 define <4 x float> @sel_v4f32(<4 x float> %x, <4 x float> %y, <4 x i1> %cond) {
 ; CHECK-LABEL: @sel_v4f32(
-; CHECK-NEXT:    [[S:%.*]] = sext <4 x i1> [[COND:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32> [[S]] to <4 x float>
-; CHECK-NEXT:    [[R:%.*]] = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[B]])
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[X:%.*]]
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %s = sext <4 x i1> %cond to <4 x i32>
@@ -170,17 +168,17 @@
 
 define <2 x double> @sel_v2f64(<2 x double> %x, <2 x double> %y, <2 x i1> %cond) {
 ; CHECK-LABEL: @sel_v2f64(
-; CHECK-NEXT:    [[S:%.*]] = sext <2 x i1> [[COND:%.*]] to <2 x i64>
-; CHECK-NEXT:    [[B:%.*]] = bitcast <2 x i64> [[S]] to <2 x double>
-; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> [[X:%.*]], <2 x double> [[Y:%.*]], <2 x double> [[B]])
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[COND:%.*]], <2 x double> [[Y:%.*]], <2 x double> [[X:%.*]]
 ; CHECK-NEXT:    ret <2 x double> [[R]]
 ;
   %s = sext <2 x i1> %cond to <2 x i64>
   %b = bitcast <2 x i64> %s to <2 x double>
   %r = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %x, <2 x double> %y, <2 x double> %b)
   ret <2 x double> %r
 }
 
+; TODO: We can bitcast X, Y, and the select and remove the intrinsic.
+
 define <16 x i8> @sel_v4i32(<16 x i8> %x, <16 x i8> %y, <4 x i1> %cond) {
 ; CHECK-LABEL: @sel_v4i32(
 ; CHECK-NEXT:    [[S:%.*]] = sext <4 x i1> [[COND:%.*]] to <4 x i32>
@@ -196,8 +194,7 @@
 
 define <16 x i8> @sel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i1> %cond) {
 ; CHECK-LABEL: @sel_v16i8(
-; CHECK-NEXT:    [[S:%.*]] = sext <16 x i1> [[COND:%.*]] to <16 x i8>
-; CHECK-NEXT:    [[R:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[X:%.*]], <16 x i8> [[Y:%.*]], <16 x i8> [[S]])
+; CHECK-NEXT:    [[R:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i8> [[Y:%.*]], <16 x i8> [[X:%.*]]
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
 ;
   %s = sext <16 x i1> %cond to <16 x i8>
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2929,16 +2929,10 @@
   case Intrinsic::x86_avx_blendv_ps_256:
   case Intrinsic::x86_avx_blendv_pd_256:
   case Intrinsic::x86_avx2_pblendvb: {
-    // Convert blendv* to vector selects if the mask is constant.
-    // This optimization is convoluted because the intrinsic is defined as
-    // getting a vector of floats or doubles for the ps and pd versions.
-    // FIXME: That should be changed.
-
+    // fold (blend A, A, Mask) -> A
     Value *Op0 = II->getArgOperand(0);
     Value *Op1 = II->getArgOperand(1);
     Value *Mask = II->getArgOperand(2);
-
-    // fold (blend A, A, Mask) -> A
     if (Op0 == Op1)
       return replaceInstUsesWith(CI, Op0);
 
@@ -2951,6 +2945,20 @@
       Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
     }
+
+    // Convert to a vector select if we can bypass casts and find a boolean
+    // vector condition value.
+    Value *BoolVec;
+    if (match(peekThroughBitcast(Mask), m_SExt(m_Value(BoolVec)))) {
+      auto *VTy = dyn_cast<VectorType>(BoolVec->getType());
+      if (VTy && VTy->getScalarSizeInBits() == 1 &&
+          VTy->getVectorNumElements() == II->getType()->getVectorNumElements())
+        return SelectInst::Create(BoolVec, Op1, Op0);
+      // TODO: If we can find a boolean vector condition with fewer elements,
+      //       then we can form a vector select by bitcasting Op0/Op1 to a
+      //       vector type with wider elements and bitcasting the result.
+    }
+
     break;
   }
 


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D52059.165369.patch
Type: text/x-patch
Size: 4134 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180913/419a70ad/attachment.bin>