[llvm] d9ccb63 - [InstCombine] canonicalize shift+logic+shift to reduce dependency chain

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 7 09:25:16 PST 2019


Author: Sanjay Patel
Date: 2019-11-07T12:09:45-05:00
New Revision: d9ccb6367a1089fd61bd85be6b0fbb0d6a590e05

URL: https://github.com/llvm/llvm-project/commit/d9ccb6367a1089fd61bd85be6b0fbb0d6a590e05
DIFF: https://github.com/llvm/llvm-project/commit/d9ccb6367a1089fd61bd85be6b0fbb0d6a590e05.diff

LOG: [InstCombine] canonicalize shift+logic+shift to reduce dependency chain

shift (logic (shift X, C0), Y), C1 --> logic (shift X, C0+C1), (shift Y, C1)
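
For example, in the shl/and case from the updated shift-logic.ll test below,
the two serial shifts become independent and the logic op moves last:

  %sh0 = shl i8 %x, 3
  %r = and i8 %sh0, %y
  %sh1 = shl i8 %r, 2
    -->
  %tmp1 = shl i8 %x, 5
  %tmp2 = shl i8 %y, 2
  %sh1 = and i8 %tmp1, %tmp2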

This is an IR translation of an existing SDAG transform added here:
rL370617

As with the SDAG transform, there are 9 possible patterns (each of the shift
opcodes shl/lshr/ashr paired with each of the bitwise logic opcodes
and/or/xor), and each has a commuted IR variant:
https://rise4fun.com/Alive/VlI
https://rise4fun.com/Alive/n1m
https://rise4fun.com/Alive/1Vn

Part of the motivation is to allow easier recognition and subsequent
canonicalization of bswap patterns as discussed in PR43146:
https://bugs.llvm.org/show_bug.cgi?id=43146

We had to delay this transform because it used to allow the SLP vectorizer
to create awful reductions out of simple load-combines.
That problem was fixed with:
rL375025
(we'll bring back load combining in IR someday...)

The backend is also better equipped to deal with these patterns now
using hooks like TLI.getShiftAmountThreshold().
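
For reference, a minimal sketch of how a target can opt in via that hook; the
exact signature is assumed from the TargetLowering interface of this era, and
MyTargetLowering is a hypothetical target, not part of this commit:

  // Assumed default in TargetLowering: be conservative about shift cost.
  virtual unsigned getShiftAmountThreshold(EVT VT) const { return 1; }

  // Hypothetical target where any in-range shift amount is cheap:
  unsigned MyTargetLowering::getShiftAmountThreshold(EVT VT) const {
    return VT.getSizeInBits();
  }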

The only remaining potential controversy is that the -reassociate pass
tends to reverse this kind of pattern (to help GVN?). But since -reassociate
doesn't do anything with these specific patterns, there is no conflict currently.

Finally, there's a new pass proposal at D67383 for general tree-height-reduction
reassociation, and it could use a cost model to decide how to optimally rearrange
these kinds of ops for a target. That patch appears to be stalled.

Differential Revision: https://reviews.llvm.org/D69842

Added: 
    

Modified: 
    llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
    llvm/test/Transforms/InstCombine/bswap.ll
    llvm/test/Transforms/InstCombine/shift-logic.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index eea890c815ca..f35a6ce2d937 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -296,6 +296,49 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
   return BinaryOperator::Create(Instruction::And, NewShift, NewMask);
 }
 
+/// If we have a shift-by-constant of a bitwise logic op that itself has a
+/// shift-by-constant operand with identical opcode, we may be able to convert
+/// that into 2 independent shifts followed by the logic op. This eliminates a
+/// a use of an intermediate value (reduces dependency chain).
+static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
+                                            InstCombiner::BuilderTy &Builder) {
+  assert(I.isShift() && "Expected a shift as input");
+  auto *LogicInst = dyn_cast<BinaryOperator>(I.getOperand(0));
+  if (!LogicInst || !LogicInst->isBitwiseLogicOp() || !LogicInst->hasOneUse())
+    return nullptr;
+
+  const APInt *C0, *C1;
+  if (!match(I.getOperand(1), m_APInt(C1)))
+    return nullptr;
+
+  Instruction::BinaryOps ShiftOpcode = I.getOpcode();
+  Type *Ty = I.getType();
+
+  // Find a matching one-use shift by constant. The fold is not valid if the sum
+  // of the shift values equals or exceeds bitwidth.
+  // TODO: Remove the one-use check if the other logic operand (Y) is constant.
+  Value *X, *Y;
+  auto matchFirstShift = [&](Value *V) {
+    return match(V, m_OneUse(m_Shift(m_Value(X), m_APInt(C0)))) &&
+           cast<BinaryOperator>(V)->getOpcode() == ShiftOpcode &&
+           (*C0 + *C1).ult(Ty->getScalarSizeInBits());
+  };
+
+  // Logic ops are commutative, so check each operand for a match.
+  if (matchFirstShift(LogicInst->getOperand(0)))
+    Y = LogicInst->getOperand(1);
+  else if (matchFirstShift(LogicInst->getOperand(1)))
+    Y = LogicInst->getOperand(0);
+  else
+    return nullptr;
+
+  // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
+  Constant *ShiftSumC = ConstantInt::get(Ty, *C0 + *C1);
+  Value *NewShift1 = Builder.CreateBinOp(ShiftOpcode, X, ShiftSumC);
+  Value *NewShift2 = Builder.CreateBinOp(ShiftOpcode, Y, I.getOperand(1));
+  return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2);
+}
+
 Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
   assert(Op0->getType() == Op1->getType());
@@ -348,6 +391,9 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
     return &I;
   }
 
+  if (Instruction *Logic = foldShiftOfShiftedLogic(I, Builder))
+    return Logic;
+
   return nullptr;
 }
 

diff --git a/llvm/test/Transforms/InstCombine/bswap.ll b/llvm/test/Transforms/InstCombine/bswap.ll
index 2d8a69b86a59..540b1752c266 100644
--- a/llvm/test/Transforms/InstCombine/bswap.ll
+++ b/llvm/test/Transforms/InstCombine/bswap.ll
@@ -169,10 +169,10 @@ define i32 @bswap32_shl_first(i32 %x) {
 
 define i32 @bswap32_shl_first_extra_use(i32 %x) {
 ; CHECK-LABEL: @bswap32_shl_first_extra_use(
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[X:%.*]], 16
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT:    [[SWAPHALF:%.*]] = or i32 [[SHL]], [[SHR]]
-; CHECK-NEXT:    [[T:%.*]] = shl i32 [[SWAPHALF]], 8
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X]], 24
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i32 [[SHR]], 8
+; CHECK-NEXT:    [[T:%.*]] = or i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[BSWAP:%.*]] = call i32 @llvm.bswap.i32(i32 [[X]])
 ; CHECK-NEXT:    call void @extra_use(i32 [[T]])
 ; CHECK-NEXT:    ret i32 [[BSWAP]]

diff --git a/llvm/test/Transforms/InstCombine/shift-logic.ll b/llvm/test/Transforms/InstCombine/shift-logic.ll
index f387d1f738be..453463f46fc4 100644
--- a/llvm/test/Transforms/InstCombine/shift-logic.ll
+++ b/llvm/test/Transforms/InstCombine/shift-logic.ll
@@ -3,9 +3,9 @@
 
 define i8 @shl_and(i8 %x, i8 %y) {
 ; CHECK-LABEL: @shl_and(
-; CHECK-NEXT:    [[SH0:%.*]] = shl i8 [[X:%.*]], 3
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[SH0]], [[Y:%.*]]
-; CHECK-NEXT:    [[SH1:%.*]] = shl i8 [[R]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i8 [[X:%.*]], 5
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i8 [[Y:%.*]], 2
+; CHECK-NEXT:    [[SH1:%.*]] = and i8 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret i8 [[SH1]]
 ;
   %sh0 = shl i8 %x, 3
@@ -17,9 +17,9 @@ define i8 @shl_and(i8 %x, i8 %y) {
 define i16 @shl_or(i16 %x, i16 %py) {
 ; CHECK-LABEL: @shl_or(
 ; CHECK-NEXT:    [[Y:%.*]] = srem i16 [[PY:%.*]], 42
-; CHECK-NEXT:    [[SH0:%.*]] = shl i16 [[X:%.*]], 5
-; CHECK-NEXT:    [[R:%.*]] = or i16 [[Y]], [[SH0]]
-; CHECK-NEXT:    [[SH1:%.*]] = shl i16 [[R]], 7
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i16 [[X:%.*]], 12
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw i16 [[Y]], 7
+; CHECK-NEXT:    [[SH1:%.*]] = or i16 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret i16 [[SH1]]
 ;
   %y = srem i16 %py, 42 ; thwart complexity-based canonicalization
@@ -31,9 +31,9 @@ define i16 @shl_or(i16 %x, i16 %py) {
 
 define i32 @shl_xor(i32 %x, i32 %y) {
 ; CHECK-LABEL: @shl_xor(
-; CHECK-NEXT:    [[SH0:%.*]] = shl i32 [[X:%.*]], 5
-; CHECK-NEXT:    [[R:%.*]] = xor i32 [[SH0]], [[Y:%.*]]
-; CHECK-NEXT:    [[SH1:%.*]] = shl i32 [[R]], 7
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X:%.*]], 12
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[Y:%.*]], 7
+; CHECK-NEXT:    [[SH1:%.*]] = xor i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret i32 [[SH1]]
 ;
   %sh0 = shl i32 %x, 5
@@ -45,9 +45,9 @@ define i32 @shl_xor(i32 %x, i32 %y) {
 define i64 @lshr_and(i64 %x, i64 %py) {
 ; CHECK-LABEL: @lshr_and(
 ; CHECK-NEXT:    [[Y:%.*]] = srem i64 [[PY:%.*]], 42
-; CHECK-NEXT:    [[SH0:%.*]] = lshr i64 [[X:%.*]], 5
-; CHECK-NEXT:    [[R:%.*]] = and i64 [[Y]], [[SH0]]
-; CHECK-NEXT:    [[SH1:%.*]] = lshr i64 [[R]], 7
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[X:%.*]], 12
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[Y]], 7
+; CHECK-NEXT:    [[SH1:%.*]] = and i64 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret i64 [[SH1]]
 ;
   %y = srem i64 %py, 42 ; thwart complexity-based canonicalization
@@ -59,9 +59,9 @@ define i64 @lshr_and(i64 %x, i64 %py) {
 
 define <4 x i32> @lshr_or(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @lshr_or(
-; CHECK-NEXT:    [[SH0:%.*]] = lshr <4 x i32> [[X:%.*]], <i32 5, i32 5, i32 5, i32 5>
-; CHECK-NEXT:    [[R:%.*]] = or <4 x i32> [[SH0]], [[Y:%.*]]
-; CHECK-NEXT:    [[SH1:%.*]] = lshr <4 x i32> [[R]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[X:%.*]], <i32 12, i32 12, i32 12, i32 12>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <4 x i32> [[Y:%.*]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[SH1:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <4 x i32> [[SH1]]
 ;
   %sh0 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
@@ -73,9 +73,9 @@ define <4 x i32> @lshr_or(<4 x i32> %x, <4 x i32> %y) {
 define <8 x i16> @lshr_xor(<8 x i16> %x, <8 x i16> %py) {
 ; CHECK-LABEL: @lshr_xor(
 ; CHECK-NEXT:    [[Y:%.*]] = srem <8 x i16> [[PY:%.*]], <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
-; CHECK-NEXT:    [[SH0:%.*]] = lshr <8 x i16> [[X:%.*]], <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
-; CHECK-NEXT:    [[R:%.*]] = xor <8 x i16> [[Y]], [[SH0]]
-; CHECK-NEXT:    [[SH1:%.*]] = lshr <8 x i16> [[R]], <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <8 x i16> [[X:%.*]], <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <8 x i16> [[Y]], <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+; CHECK-NEXT:    [[SH1:%.*]] = xor <8 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <8 x i16> [[SH1]]
 ;
   %y = srem <8 x i16> %py, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 -42> ; thwart complexity-based canonicalization
@@ -89,9 +89,9 @@ define <8 x i16> @lshr_xor(<8 x i16> %x, <8 x i16> %py) {
 define <16 x i8> @ashr_and(<16 x i8> %x, <16 x i8> %py, <16 x i8> %pz) {
 ; CHECK-LABEL: @ashr_and(
 ; CHECK-NEXT:    [[Y:%.*]] = srem <16 x i8> [[PY:%.*]], [[PZ:%.*]]
-; CHECK-NEXT:    [[SH0:%.*]] = ashr <16 x i8> [[X:%.*]], <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; CHECK-NEXT:    [[R:%.*]] = and <16 x i8> [[Y]], [[SH0]]
-; CHECK-NEXT:    [[SH1:%.*]] = ashr <16 x i8> [[R]], <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <16 x i8> [[X:%.*]], <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr <16 x i8> [[Y]], <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+; CHECK-NEXT:    [[SH1:%.*]] = and <16 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <16 x i8> [[SH1]]
 ;
   %y = srem <16 x i8> %py, %pz ; thwart complexity-based canonicalization
@@ -103,9 +103,9 @@ define <16 x i8> @ashr_and(<16 x i8> %x, <16 x i8> %py, <16 x i8> %pz) {
 
 define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @ashr_or(
-; CHECK-NEXT:    [[SH0:%.*]] = ashr <2 x i64> [[X:%.*]], <i64 5, i64 5>
-; CHECK-NEXT:    [[R:%.*]] = or <2 x i64> [[SH0]], [[Y:%.*]]
-; CHECK-NEXT:    [[SH1:%.*]] = ashr <2 x i64> [[R]], <i64 7, i64 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <2 x i64> [[X:%.*]], <i64 12, i64 12>
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr <2 x i64> [[Y:%.*]], <i64 7, i64 7>
+; CHECK-NEXT:    [[SH1:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <2 x i64> [[SH1]]
 ;
   %sh0 = ashr <2 x i64> %x, <i64 5, i64 5>
@@ -117,9 +117,9 @@ define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) {
 define i32 @ashr_xor(i32 %x, i32 %py) {
 ; CHECK-LABEL: @ashr_xor(
 ; CHECK-NEXT:    [[Y:%.*]] = srem i32 [[PY:%.*]], 42
-; CHECK-NEXT:    [[SH0:%.*]] = ashr i32 [[X:%.*]], 5
-; CHECK-NEXT:    [[R:%.*]] = xor i32 [[Y]], [[SH0]]
-; CHECK-NEXT:    [[SH1:%.*]] = ashr i32 [[R]], 7
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 12
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y]], 7
+; CHECK-NEXT:    [[SH1:%.*]] = xor i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret i32 [[SH1]]
 ;
   %y = srem i32 %py, 42 ; thwart complexity-based canonicalization
