[llvm] bbf3925 - [InstCombine] matchFunnelShift - fold or(shl(a,x),lshr(b,sub(bw,x))) -> fshl(a,b,x) iff x < bw (REAPPLIED)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 12 08:08:35 PDT 2020


Author: Simon Pilgrim
Date: 2020-10-12T16:06:41+01:00
New Revision: bbf3925879b56aea42daeecc794bb41e99ebc126

URL: https://github.com/llvm/llvm-project/commit/bbf3925879b56aea42daeecc794bb41e99ebc126
DIFF: https://github.com/llvm/llvm-project/commit/bbf3925879b56aea42daeecc794bb41e99ebc126.diff

LOG: [InstCombine] matchFunnelShift - fold or(shl(a,x),lshr(b,sub(bw,x))) -> fshl(a,b,x) iff x < bw (REAPPLIED)

If value tracking can confirm that the shift amount is less than the type bitwidth, then we can more confidently fold general or(shl(a,x),lshr(b,sub(bw,x))) patterns to a funnel shift/rotate intrinsic without causing bad codegen regressions in the backend (see D89139).
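
As a minimal illustration of the new fold (mirroring the fshl_sub_mask test updated below; the value names are just for exposition), the 'and' mask is what lets value tracking prove that the shift amount is below the bitwidth:

  ; The mask guarantees %x < 64, so the or(shl, lshr) pair is UB-safe.
  define i64 @fshl_example(i64 %a, i64 %b, i64 %amt) {
    %x = and i64 %amt, 63
    %shl = shl i64 %a, %x
    %sub = sub nuw nsw i64 64, %x
    %shr = lshr i64 %b, %sub
    %r = or i64 %shl, %shr
    ret i64 %r
  }

  ; After this patch, InstCombine reduces the sequence to a single call:
  ;   %r = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %amt)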

Reapplied after the shift canonicalization in rG02295e6d1a15, which removed the need to flip the shift values.

Differential Revision: https://reviews.llvm.org/D88783

Added: 
    

Modified: 
    llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
    llvm/test/Transforms/InstCombine/funnel.ll
    llvm/test/Transforms/InstCombine/rotate.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index d2d2cecb9a65..fb415bf20d1a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2053,7 +2053,7 @@ Instruction *InstCombinerImpl::matchBSwap(BinaryOperator &Or) {
 }
 
 /// Match UB-safe variants of the funnel shift intrinsic.
-static Instruction *matchFunnelShift(Instruction &Or) {
+static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
   // TODO: Can we reduce the code duplication between this and the related
   // rotate matching code under visitSelect and visitTrunc?
   unsigned Width = Or.getType()->getScalarSizeInBits();
@@ -2100,6 +2100,16 @@ static Instruction *matchFunnelShift(Instruction &Or) {
         return L;
     }
 
+    // (shl ShVal, X) | (lshr ShVal, (Width - X)) iff X < Width.
+    // We limit this to X < Width in case the backend re-expands the intrinsic,
+    // and has to reintroduce a shift modulo operation (InstCombine might remove
+    // it after this fold). This still doesn't guarantee that the final codegen
+    // will match this original pattern.
+    if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L))))) {
+      KnownBits KnownL = IC.computeKnownBits(L, /*Depth*/ 0, &Or);
+      return KnownL.getMaxValue().ult(Width) ? L : nullptr;
+    }
+
     // For non-constant cases, the following patterns currently only work for
     // rotation patterns.
     // TODO: Add general funnel-shift compatible patterns.
@@ -2593,7 +2603,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (Instruction *BSwap = matchBSwap(I))
     return BSwap;
 
-  if (Instruction *Funnel = matchFunnelShift(I))
+  if (Instruction *Funnel = matchFunnelShift(I, *this))
     return Funnel;
 
   if (Instruction *Concat = matchOrConcat(I, Builder))

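As an illustration of the X < Width restriction described in the new matchFunnelShift comment above (this is a sketch, not one of the committed tests): without a mask bounding the shift amount, computeKnownBits cannot prove the amount is below the bitwidth, so the or(shl, lshr) pair is expected to be left unfolded.

  ; Hypothetical example - %amt is unbounded, so no @llvm.fshl call should be formed.
  define i64 @no_fold_unmasked_amount(i64 %a, i64 %b, i64 %amt) {
    %shl = shl i64 %a, %amt
    %sub = sub i64 64, %amt
    %lshr = lshr i64 %b, %sub
    %r = or i64 %shl, %lshr
    ret i64 %r
  }
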
diff  --git a/llvm/test/Transforms/InstCombine/funnel.ll b/llvm/test/Transforms/InstCombine/funnel.ll
index d56bb7411910..7277681a1088 100644
--- a/llvm/test/Transforms/InstCombine/funnel.ll
+++ b/llvm/test/Transforms/InstCombine/funnel.ll
@@ -168,11 +168,7 @@ define <3 x i36> @fshl_v3i36_constant_nonsplat_undef0(<3 x i36> %x, <3 x i36> %y
 
 define i64 @fshl_sub_mask(i64 %x, i64 %y, i64 %a) {
 ; CHECK-LABEL: @fshl_sub_mask(
-; CHECK-NEXT:    [[MASK:%.*]] = and i64 [[A:%.*]], 63
-; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[X:%.*]], [[MASK]]
-; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i64 64, [[MASK]]
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i64 [[Y:%.*]], [[SUB]]
-; CHECK-NEXT:    [[R:%.*]] = or i64 [[SHL]], [[SHR]]
+; CHECK-NEXT:    [[R:%.*]] = call i64 @llvm.fshl.i64(i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]])
 ; CHECK-NEXT:    ret i64 [[R]]
 ;
   %mask = and i64 %a, 63
@@ -187,11 +183,7 @@ define i64 @fshl_sub_mask(i64 %x, i64 %y, i64 %a) {
 
 define i64 @fshr_sub_mask(i64 %x, i64 %y, i64 %a) {
 ; CHECK-LABEL: @fshr_sub_mask(
-; CHECK-NEXT:    [[MASK:%.*]] = and i64 [[A:%.*]], 63
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i64 [[X:%.*]], [[MASK]]
-; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i64 64, [[MASK]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[Y:%.*]], [[SUB]]
-; CHECK-NEXT:    [[R:%.*]] = or i64 [[SHL]], [[SHR]]
+; CHECK-NEXT:    [[R:%.*]] = call i64 @llvm.fshr.i64(i64 [[Y:%.*]], i64 [[X:%.*]], i64 [[A:%.*]])
 ; CHECK-NEXT:    ret i64 [[R]]
 ;
   %mask = and i64 %a, 63
@@ -204,11 +196,7 @@ define i64 @fshr_sub_mask(i64 %x, i64 %y, i64 %a) {
 
 define <2 x i64> @fshr_sub_mask_vector(<2 x i64> %x, <2 x i64> %y, <2 x i64> %a) {
 ; CHECK-LABEL: @fshr_sub_mask_vector(
-; CHECK-NEXT:    [[MASK:%.*]] = and <2 x i64> [[A:%.*]], <i64 63, i64 63>
-; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i64> [[X:%.*]], [[MASK]]
-; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw <2 x i64> <i64 64, i64 64>, [[MASK]]
-; CHECK-NEXT:    [[SHL:%.*]] = shl <2 x i64> [[Y:%.*]], [[SUB]]
-; CHECK-NEXT:    [[R:%.*]] = or <2 x i64> [[SHL]], [[SHR]]
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> [[Y:%.*]], <2 x i64> [[X:%.*]], <2 x i64> [[A:%.*]])
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
 ;
   %mask = and <2 x i64> %a, <i64 63, i64 63>

diff  --git a/llvm/test/Transforms/InstCombine/rotate.ll b/llvm/test/Transforms/InstCombine/rotate.ll
index 667b5f087c8b..363750ea13a1 100644
--- a/llvm/test/Transforms/InstCombine/rotate.ll
+++ b/llvm/test/Transforms/InstCombine/rotate.ll
@@ -676,12 +676,8 @@ define i9 @rotateleft_9_neg_mask_wide_amount_commute(i9 %v, i33 %shamt) {
 
 define i64 @rotl_sub_mask(i64 %0, i64 %1) {
 ; CHECK-LABEL: @rotl_sub_mask(
-; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[TMP1:%.*]], 63
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP0:%.*]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub nuw nsw i64 64, [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP0]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], [[TMP4]]
-; CHECK-NEXT:    ret i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.fshl.i64(i64 [[TMP0:%.*]], i64 [[TMP0]], i64 [[TMP1:%.*]])
+; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
   %3 = and i64 %1, 63
   %4 = shl i64 %0, %3
@@ -695,12 +691,8 @@ define i64 @rotl_sub_mask(i64 %0, i64 %1) {
 
 define i64 @rotr_sub_mask(i64 %0, i64 %1) {
 ; CHECK-LABEL: @rotr_sub_mask(
-; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[TMP1:%.*]], 63
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP0:%.*]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub nuw nsw i64 64, [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP0]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], [[TMP4]]
-; CHECK-NEXT:    ret i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.fshr.i64(i64 [[TMP0:%.*]], i64 [[TMP0]], i64 [[TMP1:%.*]])
+; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
   %3 = and i64 %1, 63
   %4 = lshr i64 %0, %3
@@ -712,12 +704,8 @@ define i64 @rotr_sub_mask(i64 %0, i64 %1) {
 
 define <2 x i64> @rotr_sub_mask_vector(<2 x i64> %0, <2 x i64> %1) {
 ; CHECK-LABEL: @rotr_sub_mask_vector(
-; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP1:%.*]], <i64 63, i64 63>
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i64> [[TMP0:%.*]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub nuw nsw <2 x i64> <i64 64, i64 64>, [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl <2 x i64> [[TMP0]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or <2 x i64> [[TMP6]], [[TMP4]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> [[TMP0:%.*]], <2 x i64> [[TMP0]], <2 x i64> [[TMP1:%.*]])
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %3 = and <2 x i64> %1, <i64 63, i64 63>
   %4 = lshr <2 x i64> %0, %3

