[PATCH] D48485: [InstCombine] allow shl+mul combos with shuffle (select) fold (PR37806)

Fri Jun 22 07:04:43 PDT 2018

spatel created this revision.
spatel added reviewers: RKSimon, lebedev.ri, efriedma.
Herald added a subscriber: mcrosier.

This is an enhancement to https://reviews.llvm.org/D48401 that was discussed in:
https://bugs.llvm.org/show_bug.cgi?id=37806

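We can convert a shift-left-by-constant into a multiply: shl X, C --> mul X, (1 << C). (IR is normally canonicalized in the other direction, from multiply-by-power-of-2 to shift, because that is generally better.) Once both binops are expressed as multiplies, we can remove the shuffle just as we do in the regular case where the opcodes already match.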

This requires a small hack to make sure we don't mistakenly introduce any extra poison; the 'nsw' flag is dropped from the new binop whenever a shift is converted into a multiply:
https://rise4fun.com/Alive/ZGv

The other examples of opcodes where this would work are add+sub and fadd+fsub, but we already canonicalize those subs into adds, so there's nothing to do for those cases AFAICT. Are there other opcode pairs where we can do this kind of transform?

Note that a different fold is needed when one of the binops has already been simplified away, as seen in the test based on PR37806. We still manage to handle that one case here, but only because this fold is currently positioned above the demanded elements fold.


https://reviews.llvm.org/D48485

Files:
  lib/Transforms/InstCombine/InstCombineVectorOps.cpp
  test/Transforms/InstCombine/shuffle_select.ll


Index: test/Transforms/InstCombine/shuffle_select.ll
===================================================================

--- test/Transforms/InstCombine/shuffle_select.ll
+++ test/Transforms/InstCombine/shuffle_select.ll
@@ -239,27 +239,24 @@
   ret <4 x double> %t3
 }
 
-; FIXME:
 ; Shift-left with constant shift amount can be converted to mul to enable the fold.
 
 define <4 x i32> @mul_shl(<4 x i32> %v0) {
 ; CHECK-LABEL: @mul_shl(
-; CHECK-NEXT:    [[T1:%.*]] = mul nuw <4 x i32> [[V0:%.*]], <i32 undef, i32 undef, i32 3, i32 4>
-; CHECK-NEXT:    [[T2:%.*]] = shl nuw <4 x i32> [[V0]], <i32 5, i32 6, i32 7, i32 8>
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[T3:%.*]] = mul nuw <4 x i32> [[V0:%.*]], <i32 32, i32 64, i32 3, i32 4>
 ; CHECK-NEXT:    ret <4 x i32> [[T3]]
 ;
   %t1 = mul nuw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
   %t2 = shl nuw <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
   %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   ret <4 x i32> %t3
 }
 
+; Try with shift as operand 0 of the shuffle; 'nsw' is dropped for safety, but that could be improved.
+
 define <4 x i32> @shl_mul(<4 x i32> %v0) {
 ; CHECK-LABEL: @shl_mul(
-; CHECK-NEXT:    [[T1:%.*]] = shl nsw <4 x i32> [[V0:%.*]], <i32 1, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    [[T2:%.*]] = mul nsw <4 x i32> [[V0]], <i32 5, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[T1]], <4 x i32> [[T2]], <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
+; CHECK-NEXT:    [[T3:%.*]] = mul <4 x i32> [[V0:%.*]], <i32 5, i32 undef, i32 8, i32 16>
 ; CHECK-NEXT:    ret <4 x i32> [[T3]]
 ;
   %t1 = shl nsw <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
@@ -273,16 +270,17 @@
 
 define <4 x i32> @mul_is_nop_shl(<4 x i32> %v0) {
 ; CHECK-LABEL: @mul_is_nop_shl(
-; CHECK-NEXT:    [[T2:%.*]] = shl <4 x i32> [[V0:%.*]], <i32 5, i32 6, i32 7, i32 8>
-; CHECK-NEXT:    [[T3:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[T2]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[T3:%.*]] = shl <4 x i32> [[V0:%.*]], <i32 0, i32 6, i32 7, i32 8>
 ; CHECK-NEXT:    ret <4 x i32> [[T3]]
 ;
   %t1 = mul <4 x i32> %v0, <i32 1, i32 2, i32 3, i32 4>
   %t2 = shl <4 x i32> %v0, <i32 5, i32 6, i32 7, i32 8>
   %t3 = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   ret <4 x i32> %t3
 }
 
+; Negative test: shift amount (operand 1) must be constant.
+
 define <4 x i32> @shl_mul_not_constant_shift_amount(<4 x i32> %v0) {
 ; CHECK-LABEL: @shl_mul_not_constant_shift_amount(
 ; CHECK-NEXT:    [[T1:%.*]] = shl <4 x i32> <i32 1, i32 2, i32 3, i32 4>, [[V0:%.*]]
Index: lib/Transforms/InstCombine/InstCombineVectorOps.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1164,8 +1164,28 @@
   else
     return nullptr;
 
-  // TODO: There are potential folds where the opcodes do not match (mul+shl).
-  if (B0->getOpcode() != B1->getOpcode())
+  // We need matching binops to fold the lanes together.
+  BinaryOperator::BinaryOps Opcode0 = B0->getOpcode();
+  BinaryOperator::BinaryOps Opcode1 = B1->getOpcode();
+  bool DropNSW = false;
+  if (ConstantsAreOp1 && Opcode0 != Opcode1) {
+    // If we have multiply and shift-left-by-constant, convert the shift:
+    // shl X, C --> mul X, 1 << C
+    // TODO: We drop "nsw" if shift is converted into multiply because it may
+    // not be correct when the shift amount is BitWidth - 1. We could examine
+    // each vector element to determine if it is safe to keep that flag.
+    if (Opcode0 == Instruction::Mul && Opcode1 == Instruction::Shl) {
+      C1 = ConstantExpr::getShl(ConstantInt::get(C1->getType(), 1), C1);
+      Opcode1 = Instruction::Mul;
+      DropNSW = true;
+    } else if (Opcode0 == Instruction::Shl && Opcode1 == Instruction::Mul) {
+      C0 = ConstantExpr::getShl(ConstantInt::get(C0->getType(), 1), C0);
+      Opcode0 = Instruction::Mul;
+      DropNSW = true;
+    }
+  }
+
+  if (Opcode0 != Opcode1)
     return nullptr;
 
   // Remove a binop and the shuffle by rearranging the constant:
@@ -1186,6 +1206,8 @@
   // Flags are intersected from the 2 source binops.
   NewBO->copyIRFlags(B0);
   NewBO->andIRFlags(B1);
+  if (DropNSW)
+    NewBO->setHasNoSignedWrap(false);
   return NewBO;
 }
 


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D48485.152470.patch
Type: text/x-patch
Size: 4494 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180622/73237e2d/attachment.bin>