[PATCH] D52747: [InstCombine] reverse 'trunc X to <N x i1>' canonicalization

Mon Oct 1 14:02:59 PDT 2018

spatel created this revision.
spatel added reviewers: efriedma, craig.topper, RKSimon, lebedev.ri.
Herald added subscribers: kristof.beyls, mcrosier.
Herald added a reviewer: javed.absar.

For vectors only, canonicalize the mask-and-compare form to a truncate:
icmp ne (and X, 1), 0 --> trunc X to <N x i1>

Ideally, I think we'd do the same for scalars, but I'm afraid of unintended consequences, so scalar 'trunc X to i1' is still canonicalized to the and+icmp form.
The motivating vector case is from PR37549:
https://bugs.llvm.org/show_bug.cgi?id=37549

  define <4 x float> @bitwise_select(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {
    %c = fcmp ole <4 x float> %x, %y
    %s = sext <4 x i1> %c to <4 x i32>
    %s1 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
    %s2 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
    %cond = or <4 x i32> %s1, %s2
    %condtr = trunc <4 x i32> %cond to <4 x i1>
    %r = select <4 x i1> %condtr, <4 x float> %z, <4 x float> %w
    ret <4 x float> %r
  }

Here's a sampling of the vector codegen for that case using mask+icmp (current behavior) vs. trunc (with this patch):

AVX before:

  vcmpleps	%xmm1, %xmm0, %xmm0
  vpermilps	$80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
  vpermilps	$250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
  vorps	%xmm0, %xmm1, %xmm0
  vandps	LCPI0_0(%rip), %xmm0, %xmm0
  vxorps	%xmm1, %xmm1, %xmm1
  vpcmpeqd	%xmm1, %xmm0, %xmm0
  vblendvps	%xmm0, %xmm3, %xmm2, %xmm0

AVX after:

  vcmpleps	%xmm1, %xmm0, %xmm0
  vpermilps	$80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
  vpermilps	$250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
  vorps	%xmm0, %xmm1, %xmm0
  vblendvps	%xmm0, %xmm2, %xmm3, %xmm0

AVX512f before:

  vcmpleps	%xmm1, %xmm0, %xmm0
  vpermilps	$80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
  vpermilps	$250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
  vorps	%xmm0, %xmm1, %xmm0
  vpbroadcastd	LCPI0_0(%rip), %xmm1 ## xmm1 = [1,1,1,1]
  vptestnmd	%zmm1, %zmm0, %k1
  vblendmps	%zmm3, %zmm2, %zmm0 {%k1}

AVX512f after:

  vcmpleps	%xmm1, %xmm0, %xmm0
  vpermilps	$80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
  vpermilps	$250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
  vorps	%xmm0, %xmm1, %xmm0
  vpslld	$31, %xmm0, %xmm0
  vptestmd	%zmm0, %zmm0, %k1
  vblendmps	%zmm2, %zmm3, %zmm0 {%k1}
   

AArch64 before:

  fcmge	v0.4s, v1.4s, v0.4s
  zip1	v1.4s, v0.4s, v0.4s
  zip2	v0.4s, v0.4s, v0.4s
  orr	v0.16b, v1.16b, v0.16b
  movi	v1.4s, #1
  and	v0.16b, v0.16b, v1.16b
  cmeq	v0.4s, v0.4s, #0
  bsl	v0.16b, v3.16b, v2.16b

AArch64 after:

  fcmge	v0.4s, v1.4s, v0.4s
  zip1	v1.4s, v0.4s, v0.4s
  zip2	v0.4s, v0.4s, v0.4s
  orr	v0.16b, v1.16b, v0.16b
  bsl	v0.16b, v2.16b, v3.16b

PowerPC-le before:

  xvcmpgesp 34, 35, 34
  vspltisw 0, 1
  vmrglw 3, 2, 2
  vmrghw 2, 2, 2
  xxlor 0, 35, 34
  xxlxor 35, 35, 35
  xxland 34, 0, 32
  vcmpequw 2, 2, 3
  xxsel 34, 36, 37, 34

PowerPC-le after:

  xvcmpgesp 34, 35, 34
  vmrglw 3, 2, 2
  vmrghw 2, 2, 2
  xxlor 0, 35, 34
  xxsel 34, 37, 36, 0


https://reviews.llvm.org/D52747

Files:
  lib/Transforms/InstCombine/InstCombineCasts.cpp
  lib/Transforms/InstCombine/InstCombineCompares.cpp
  test/Transforms/InstCombine/apint-shl-trunc.ll
  test/Transforms/InstCombine/vector-casts.ll


Index: test/Transforms/InstCombine/vector-casts.ll
===================================================================

--- test/Transforms/InstCombine/vector-casts.ll
+++ test/Transforms/InstCombine/vector-casts.ll
@@ -5,18 +5,16 @@
 
 define <2 x i1> @trunc(<2 x i64> %a) {
 ; CHECK-LABEL: @trunc(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
-; CHECK-NEXT:    [[T:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[T]]
 ;
   %t = trunc <2 x i64> %a to <2 x i1>
   ret <2 x i1> %t
 }
 
 define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {
 ; CHECK-LABEL: @and_cmp_is_trunc(
-; CHECK-NEXT:    [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>
-; CHECK-NEXT:    [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %t = and <2 x i64> %a, <i64 1, i64 1>
Index: test/Transforms/InstCombine/apint-shl-trunc.ll
===================================================================
--- test/Transforms/InstCombine/apint-shl-trunc.ll
+++ test/Transforms/InstCombine/apint-shl-trunc.ll
@@ -27,9 +27,8 @@
 
 define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) {
 ; CHECK-LABEL: @test0vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i39> <i39 1, i39 1>, [[A:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i39> [[TMP1]], [[X:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = icmp ne <2 x i39> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[B:%.*]] = lshr <2 x i39> [[X:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = trunc <2 x i39> [[B]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[D]]
 ;
   %B = lshr <2 x i39> %X, %A
Index: lib/Transforms/InstCombine/InstCombineCompares.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1708,6 +1708,12 @@
   if (!Cmp.isEquality())
     return nullptr;
 
+  // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
+  // TODO: We canonicalize to the longer form for scalars. Why?
+  if (Cmp.getPredicate() == CmpInst::ICMP_NE && Cmp.getType()->isVectorTy() &&
+      match(Cmp.getOperand(1), m_Zero()) && match(And->getOperand(1), m_One()))
+    return CastInst::CreateTruncOrBitCast(And->getOperand(0), Cmp.getType());
+
   // X & -C == -C -> X >  u ~C
   // X & -C != -C -> X <= u ~C
   //   iff C is a power of 2
Index: lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -706,8 +706,9 @@
   if (SimplifyDemandedInstructionBits(CI))
     return &CI;
 
-  // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
-  if (DestTy->getScalarSizeInBits() == 1) {
+  // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
+  // TODO: Why is using an icmp preferable to the minimal form?
+  if (DestTy->getPrimitiveSizeInBits() == 1) {
     Constant *One = ConstantInt::get(SrcTy, 1);
     Src = Builder.CreateAnd(Src, One);
     Value *Zero = Constant::getNullValue(Src->getType());


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D52747.167820.patch
Type: text/x-patch
Size: 3278 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20181001/151c49e9/attachment.bin>