[llvm] 59b22e4 - [AggressiveInstCombine] Generalize foldGuardedRotateToFunnelShift to generic funnel shifts

Tue Nov 3 03:03:45 PST 2020

Author: Simon Pilgrim
Date: 2020-11-03T10:49:49Z
New Revision: 59b22e495c15d2830f41381a327f5d6bf49ff416

URL: https://github.com/llvm/llvm-project/commit/59b22e495c15d2830f41381a327f5d6bf49ff416
DIFF: https://github.com/llvm/llvm-project/commit/59b22e495c15d2830f41381a327f5d6bf49ff416.diff

LOG: [AggressiveInstCombine] Generalize foldGuardedRotateToFunnelShift to generic funnel shifts

The fold currently only handles rotation patterns, but with the maturation of backend funnel shift handling we can now realistically handle all funnel shift patterns.

This should allow us to begin resolving PR46896 et al.

Differential Revision: https://reviews.llvm.org/D90625

Added: 
    

Modified: 
    llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
    llvm/test/Transforms/AggressiveInstCombine/funnel.ll
    llvm/test/Transforms/AggressiveInstCombine/rotate.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index e7fb699d9fda..18dff9d8f51f 100644

--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -39,6 +39,8 @@ using namespace PatternMatch;
 STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
 STATISTIC(NumGuardedRotates,
           "Number of guarded rotates transformed into funnel shifts");
+STATISTIC(NumGuardedFunnelShifts,
+          "Number of guarded funnel shifts transformed into funnel shifts");
 STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");
 
 namespace {
@@ -67,17 +69,17 @@ class AggressiveInstCombinerLegacyPass : public FunctionPass {
 };
 } // namespace
 
-/// Match a pattern for a bitwise rotate operation that partially guards
-/// against undefined behavior by branching around the rotation when the shift
-/// amount is 0.
-static bool foldGuardedRotateToFunnelShift(Instruction &I) {
+/// Match a pattern for a bitwise funnel/rotate operation that partially guards
+/// against undefined behavior by branching around the funnel-shift/rotation
+/// when the shift amount is 0.
+static bool foldGuardedFunnelShift(Instruction &I) {
   if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
     return false;
 
   // As with the one-use checks below, this is not strictly necessary, but we
   // are being cautious to avoid potential perf regressions on targets that
-  // do not actually have a rotate instruction (where the funnel shift would be
-  // expanded back into math/shift/logic ops).
+  // do not actually have a funnel/rotate instruction (where the funnel shift
+  // would be expanded back into math/shift/logic ops).
   if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
     return false;
 
@@ -111,27 +113,33 @@ static bool foldGuardedRotateToFunnelShift(Instruction &I) {
     return Intrinsic::not_intrinsic;
   };
 
-  // One phi operand must be a rotate operation, and the other phi operand must
-  // be the source value of that rotate operation:
+  // One phi operand must be a funnel/rotate operation, and the other phi
+  // operand must be the source value of that funnel/rotate operation:
   // phi [ rotate(RotSrc, ShAmt), FunnelBB ], [ RotSrc, GuardBB ]
+  // phi [ fshl(ShlVal0, ShlVal1, ShAmt), FunnelBB ], [ ShlVal0, GuardBB ]
+  // phi [ fshr(ShlVal0, ShlVal1, ShAmt), FunnelBB ], [ ShlVal1, GuardBB ]
   PHINode &Phi = cast<PHINode>(I);
   unsigned FunnelOp = 0, GuardOp = 1;
   Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
   Value *ShVal0, *ShVal1, *ShAmt;
   Intrinsic::ID IID = matchFunnelShift(P0, ShVal0, ShVal1, ShAmt);
-  if (IID == Intrinsic::not_intrinsic || ShVal0 != ShVal1 || ShVal0 != P1) {
+  if (IID == Intrinsic::not_intrinsic ||
+      (IID == Intrinsic::fshl && ShVal0 != P1) ||
+      (IID == Intrinsic::fshr && ShVal1 != P1)) {
     IID = matchFunnelShift(P1, ShVal0, ShVal1, ShAmt);
-    if (IID == Intrinsic::not_intrinsic || ShVal0 != ShVal1 || ShVal0 != P0)
+    if (IID == Intrinsic::not_intrinsic ||
+        (IID == Intrinsic::fshl && ShVal0 != P0) ||
+        (IID == Intrinsic::fshr && ShVal1 != P0))
       return false;
     assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
            "Pattern must match funnel shift left or right");
     std::swap(FunnelOp, GuardOp);
   }
-  assert(ShVal0 == ShVal1 && "Rotation funnel shift pattern expected");
 
   // The incoming block with our source operand must be the "guard" block.
-  // That must contain a cmp+branch to avoid the rotate when the shift amount
-  // is equal to 0. The other incoming block is the block with the rotate.
+  // That must contain a cmp+branch to avoid the funnel/rotate when the shift
+  // amount is equal to 0. The other incoming block is the block with the
+  // funnel/rotate.
   BasicBlock *GuardBB = Phi.getIncomingBlock(GuardOp);
   BasicBlock *FunnelBB = Phi.getIncomingBlock(FunnelOp);
   Instruction *TermI = GuardBB->getTerminator();
@@ -150,18 +158,21 @@ static bool foldGuardedRotateToFunnelShift(Instruction &I) {
   //   br i1 %cmp, label %PhiBB, label %FunnelBB
   // FunnelBB:
   //   %sub = sub i32 32, %ShAmt
-  //   %shr = lshr i32 %RotSrc, %sub
-  //   %shl = shl i32 %RotSrc, %ShAmt
-  //   %rot = or i32 %shr, %shl
+  //   %shr = lshr i32 %ShVal1, %sub
+  //   %shl = shl i32 %ShVal0, %ShAmt
+  //   %fsh = or i32 %shr, %shl
   //   br label %PhiBB
   // PhiBB:
-  //   %cond = phi i32 [ %RotSrc, %FunnelBB ], [ %RotSrc, %GuardBB ]
+  //   %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ]
   // -->
-  // llvm.fshl.i32(i32 %RotSrc, i32 %RotSrc, i32 %ShAmt)
+  // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt)
   IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
   Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType());
   Phi.replaceAllUsesWith(Builder.CreateCall(F, {ShVal0, ShVal1, ShAmt}));
-  ++NumGuardedRotates;
+  if (ShVal0 == ShVal1)
+    ++NumGuardedRotates;
+  else
+    ++NumGuardedFunnelShifts;
   return true;
 }
 
@@ -350,7 +361,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
     // iteratively in this loop rather than waiting until the end.
     for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
       MadeChange |= foldAnyOrAllBitsSet(I);
-      MadeChange |= foldGuardedRotateToFunnelShift(I);
+      MadeChange |= foldGuardedFunnelShift(I);
       MadeChange |= tryToRecognizePopCount(I); 
     }
   }

diff  --git a/llvm/test/Transforms/AggressiveInstCombine/funnel.ll b/llvm/test/Transforms/AggressiveInstCombine/funnel.ll
index 545b8e175982..d48798dc7040 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/funnel.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/funnel.ll
@@ -7,14 +7,10 @@ define i32 @fshl(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[SUB]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[C]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[OR]], [[FSHBB]] ], [ [[A]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
   %cmp = icmp eq i32 %c, 0
@@ -38,14 +34,10 @@ define i32 @fshl_commute_phi(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[SUB]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[C]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[FSHBB]] ]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
   %cmp = icmp eq i32 %c, 0
@@ -69,14 +61,10 @@ define i32 @fshl_commute_or(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[SUB]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[C]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[FSHBB]] ]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
   %cmp = icmp eq i32 %c, 0
@@ -102,15 +90,11 @@ define i32 @fshl_insert_valid_location(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[SUB]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[C]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[OR]], [[FSHBB]] ], [ [[A]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[OTHER:%.*]] = phi i32 [ 1, [[FSHBB]] ], [ 2, [[ENTRY]] ]
-; CHECK-NEXT:    [[RES:%.*]] = or i32 [[COND]], [[OTHER]]
+; CHECK-NEXT:    [[OTHER:%.*]] = phi i32 [ 1, [[FSHBB]] ], [ 2, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
+; CHECK-NEXT:    [[RES:%.*]] = or i32 [[TMP0]], [[OTHER]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
 entry:
@@ -137,14 +121,10 @@ define i32 @fshr(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[C]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[OR]], [[FSHBB]] ], [ [[B]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
   %cmp = icmp eq i32 %c, 0
@@ -168,14 +148,10 @@ define i32 @fshr_commute_phi(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[C]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[B]], [[ENTRY:%.*]] ], [ [[OR]], [[FSHBB]] ]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
   %cmp = icmp eq i32 %c, 0
@@ -199,14 +175,10 @@ define i32 @fshr_commute_or(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[C]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[B]], [[ENTRY:%.*]] ], [ [[OR]], [[FSHBB]] ]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
   %cmp = icmp eq i32 %c, 0
@@ -396,7 +368,7 @@ end:
   ret i32 %cond
 }
 
-; Negative test - wrong shift.
+; Negative test - wrong shift for rotate (but can be folded to a generic funnel shift).
 
 define i32 @not_fshr_5(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: @not_fshr_5(
@@ -404,14 +376,10 @@ define i32 @not_fshr_5(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[C]], [[SUB]]
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[C]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[B]], [[ENTRY:%.*]] ], [ [[OR]], [[FSHBB]] ]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[C]], i32 [[B:%.*]], i32 [[C]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
   %cmp = icmp eq i32 %c, 0

diff  --git a/llvm/test/Transforms/AggressiveInstCombine/rotate.ll b/llvm/test/Transforms/AggressiveInstCombine/rotate.ll
index e47fa9be6f83..9e904e098219 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/rotate.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/rotate.ll
@@ -370,7 +370,7 @@ end:
   ret i32 %cond
 }
 
-; Negative test - wrong shift.
+; Negative test - wrong shift for rotate (but can be folded to a generic funnel shift).
 
 define i32 @not_rotr_5(i32 %a, i32 %b) {
 ; CHECK-LABEL: @not_rotr_5(
@@ -378,14 +378,10 @@ define i32 @not_rotr_5(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
 ; CHECK:       rotbb:
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[B]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[B]], [[SUB]]
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[A:%.*]], [[B]]
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[ROTBB]] ]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[B]], i32 [[A:%.*]], i32 [[B]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
   %cmp = icmp eq i32 %b, 0