[clang-tools-extra] [InstCombine] Convert or concat to fshl if opposite or concat exists (PR #68502)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Oct 23 17:35:15 PDT 2023
https://github.com/HaohaiWen updated https://github.com/llvm/llvm-project/pull/68502
From 5b3b1bbb5b263bc5711adde031d85b1461ccbab6 Mon Sep 17 00:00:00 2001
From: Haohai Wen <haohai.wen at intel.com>
Date: Sat, 7 Oct 2023 13:48:32 +0800
Subject: [PATCH 1/5] [InstCombine] Refactor matchFunnelShift to allow more
 patterns (NFC)
The current implementation of matchFunnelShift only handles the
opposite-shift pattern. Refactor it so that more patterns can be added.
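
For context, a minimal sketch of the opposite-shift pattern the function
already recognizes, a rotate built from an or of a shl and an lshr whose
constant amounts sum to the bitwidth (the function name is illustrative,
not from this patch):

  define i32 @rotl8(i32 %x) {
    %shl = shl i32 %x, 8
    %shr = lshr i32 %x, 24
    ; 8 + 24 == 32, so InstCombine folds the 'or' to
    ; call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 8)
    %r = or i32 %shl, %shr
    ret i32 %r
  }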
---
.../InstCombine/InstCombineAndOrXor.cpp | 172 ++++++++++--------
1 file changed, 93 insertions(+), 79 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index cbdab3e9c5fb91d..b04e070fd19d7d1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2732,100 +2732,114 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
// rotate matching code under visitSelect and visitTrunc?
unsigned Width = Or.getType()->getScalarSizeInBits();
- // First, find an or'd pair of opposite shifts:
- // or (lshr ShVal0, ShAmt0), (shl ShVal1, ShAmt1)
- BinaryOperator *Or0, *Or1;
- if (!match(Or.getOperand(0), m_BinOp(Or0)) ||
- !match(Or.getOperand(1), m_BinOp(Or1)))
- return nullptr;
-
- Value *ShVal0, *ShVal1, *ShAmt0, *ShAmt1;
- if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(ShVal0), m_Value(ShAmt0)))) ||
- !match(Or1, m_OneUse(m_LogicalShift(m_Value(ShVal1), m_Value(ShAmt1)))) ||
- Or0->getOpcode() == Or1->getOpcode())
+ Instruction *Or0, *Or1;
+ if (!match(Or.getOperand(0), m_Instruction(Or0)) ||
+ !match(Or.getOperand(1), m_Instruction(Or1)))
return nullptr;
- // Canonicalize to or(shl(ShVal0, ShAmt0), lshr(ShVal1, ShAmt1)).
- if (Or0->getOpcode() == BinaryOperator::LShr) {
- std::swap(Or0, Or1);
- std::swap(ShVal0, ShVal1);
- std::swap(ShAmt0, ShAmt1);
- }
- assert(Or0->getOpcode() == BinaryOperator::Shl &&
- Or1->getOpcode() == BinaryOperator::LShr &&
- "Illegal or(shift,shift) pair");
-
- // Match the shift amount operands for a funnel shift pattern. This always
- // matches a subtraction on the R operand.
- auto matchShiftAmount = [&](Value *L, Value *R, unsigned Width) -> Value * {
- // Check for constant shift amounts that sum to the bitwidth.
- const APInt *LI, *RI;
- if (match(L, m_APIntAllowUndef(LI)) && match(R, m_APIntAllowUndef(RI)))
- if (LI->ult(Width) && RI->ult(Width) && (*LI + *RI) == Width)
- return ConstantInt::get(L->getType(), *LI);
-
- Constant *LC, *RC;
- if (match(L, m_Constant(LC)) && match(R, m_Constant(RC)) &&
- match(L, m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, APInt(Width, Width))) &&
- match(R, m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, APInt(Width, Width))) &&
- match(ConstantExpr::getAdd(LC, RC), m_SpecificIntAllowUndef(Width)))
- return ConstantExpr::mergeUndefsWith(LC, RC);
-
- // (shl ShVal, X) | (lshr ShVal, (Width - x)) iff X < Width.
- // We limit this to X < Width in case the backend re-expands the intrinsic,
- // and has to reintroduce a shift modulo operation (InstCombine might remove
- // it after this fold). This still doesn't guarantee that the final codegen
- // will match this original pattern.
- if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L))))) {
- KnownBits KnownL = IC.computeKnownBits(L, /*Depth*/ 0, &Or);
- return KnownL.getMaxValue().ult(Width) ? L : nullptr;
- }
+ bool IsFshl = true; // Sub on LSHR.
+ SmallVector<Value *, 3> FShiftArgs;
- // For non-constant cases, the following patterns currently only work for
- // rotation patterns.
- // TODO: Add general funnel-shift compatible patterns.
- if (ShVal0 != ShVal1)
+ // First, find an or'd pair of opposite shifts:
+ // or (lshr ShVal0, ShAmt0), (shl ShVal1, ShAmt1)
+ if (isa<BinaryOperator>(Or0) && isa<BinaryOperator>(Or1)) {
+ Value *ShVal0, *ShVal1, *ShAmt0, *ShAmt1;
+ if (!match(Or0,
+ m_OneUse(m_LogicalShift(m_Value(ShVal0), m_Value(ShAmt0)))) ||
+ !match(Or1,
+ m_OneUse(m_LogicalShift(m_Value(ShVal1), m_Value(ShAmt1)))) ||
+ Or0->getOpcode() == Or1->getOpcode())
return nullptr;
- // For non-constant cases we don't support non-pow2 shift masks.
- // TODO: Is it worth matching urem as well?
- if (!isPowerOf2_32(Width))
- return nullptr;
+ // Canonicalize to or(shl(ShVal0, ShAmt0), lshr(ShVal1, ShAmt1)).
+ if (Or0->getOpcode() == BinaryOperator::LShr) {
+ std::swap(Or0, Or1);
+ std::swap(ShVal0, ShVal1);
+ std::swap(ShAmt0, ShAmt1);
+ }
+ assert(Or0->getOpcode() == BinaryOperator::Shl &&
+ Or1->getOpcode() == BinaryOperator::LShr &&
+ "Illegal or(shift,shift) pair");
+
+ // Match the shift amount operands for a funnel shift pattern. This always
+ // matches a subtraction on the R operand.
+ auto matchShiftAmount = [&](Value *L, Value *R, unsigned Width) -> Value * {
+ // Check for constant shift amounts that sum to the bitwidth.
+ const APInt *LI, *RI;
+ if (match(L, m_APIntAllowUndef(LI)) && match(R, m_APIntAllowUndef(RI)))
+ if (LI->ult(Width) && RI->ult(Width) && (*LI + *RI) == Width)
+ return ConstantInt::get(L->getType(), *LI);
+
+ Constant *LC, *RC;
+ if (match(L, m_Constant(LC)) && match(R, m_Constant(RC)) &&
+ match(L,
+ m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, APInt(Width, Width))) &&
+ match(R,
+ m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, APInt(Width, Width))) &&
+ match(ConstantExpr::getAdd(LC, RC), m_SpecificIntAllowUndef(Width)))
+ return ConstantExpr::mergeUndefsWith(LC, RC);
+
+ // (shl ShVal, X) | (lshr ShVal, (Width - x)) iff X < Width.
+ // We limit this to X < Width in case the backend re-expands the
+ // intrinsic, and has to reintroduce a shift modulo operation (InstCombine
+ // might remove it after this fold). This still doesn't guarantee that the
+ // final codegen will match this original pattern.
+ if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L))))) {
+ KnownBits KnownL = IC.computeKnownBits(L, /*Depth*/ 0, &Or);
+ return KnownL.getMaxValue().ult(Width) ? L : nullptr;
+ }
- // The shift amount may be masked with negation:
- // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1)))
- Value *X;
- unsigned Mask = Width - 1;
- if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
- match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
- return X;
+ // For non-constant cases, the following patterns currently only work for
+ // rotation patterns.
+ // TODO: Add general funnel-shift compatible patterns.
+ if (ShVal0 != ShVal1)
+ return nullptr;
- // Similar to above, but the shift amount may be extended after masking,
- // so return the extended value as the parameter for the intrinsic.
- if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
- match(R, m_And(m_Neg(m_ZExt(m_And(m_Specific(X), m_SpecificInt(Mask)))),
- m_SpecificInt(Mask))))
- return L;
+ // For non-constant cases we don't support non-pow2 shift masks.
+ // TODO: Is it worth matching urem as well?
+ if (!isPowerOf2_32(Width))
+ return nullptr;
- if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
- match(R, m_ZExt(m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))))
- return L;
+ // The shift amount may be masked with negation:
+ // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1)))
+ Value *X;
+ unsigned Mask = Width - 1;
+ if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
+ match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
+ return X;
+
+ // Similar to above, but the shift amount may be extended after masking,
+ // so return the extended value as the parameter for the intrinsic.
+ if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
+ match(R,
+ m_And(m_Neg(m_ZExt(m_And(m_Specific(X), m_SpecificInt(Mask)))),
+ m_SpecificInt(Mask))))
+ return L;
+
+ if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
+ match(R, m_ZExt(m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))))
+ return L;
- return nullptr;
- };
+ return nullptr;
+ };
- Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, Width);
- bool IsFshl = true; // Sub on LSHR.
- if (!ShAmt) {
- ShAmt = matchShiftAmount(ShAmt1, ShAmt0, Width);
- IsFshl = false; // Sub on SHL.
+ Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, Width);
+ if (!ShAmt) {
+ ShAmt = matchShiftAmount(ShAmt1, ShAmt0, Width);
+ IsFshl = false; // Sub on SHL.
+ }
+ if (!ShAmt)
+ return nullptr;
+
+ FShiftArgs = {ShVal0, ShVal1, ShAmt};
}
- if (!ShAmt)
+
+ if (FShiftArgs.empty())
return nullptr;
Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
Function *F = Intrinsic::getDeclaration(Or.getModule(), IID, Or.getType());
- return CallInst::Create(F, {ShVal0, ShVal1, ShAmt});
+ return CallInst::Create(F, FShiftArgs);
}
/// Attempt to combine or(zext(x),shl(zext(y),bw/2) concat packing patterns.
From 68ab662ac8666648f1838a18b06f05d3680604d3 Mon Sep 17 00:00:00 2001
From: Haohai Wen <haohai.wen at intel.com>
Date: Sat, 7 Oct 2023 16:50:22 +0800
Subject: [PATCH 2/5] [InstCombine] Convert or concat to fshl if opposite or
concat exists
If there are two 'or' instructions that concatenate the same variables
in opposite orders and the first 'or' dominates the second one, the
second 'or' can be optimized to an fshl that rotates the first 'or'.
This eliminates a shl and exposes more optimization opportunities for
bswap/bitreverse.
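
A sketch of the intended fold, mirroring the fshl_concat test added
below (names are illustrative):

  %zext.x = zext i8 %x to i32
  %slx = shl nuw i32 %zext.x, 24
  %zext.y = zext i24 %y to i32
  %xy = or i32 %zext.y, %slx            ; first concat:  X:Y
  store i32 %xy, ptr %addr, align 4
  %sly = shl nuw i32 %zext.y, 8
  %yx = or i32 %zext.x, %sly            ; second concat: Y:X
  ; the second 'or' (and its shl) becomes a rotate of the first:
  ; %yx = call i32 @llvm.fshl.i32(i32 %xy, i32 %xy, i32 8)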
---
.../InstCombine/InstCombineAndOrXor.cpp | 46 ++++++++++++++++++-
llvm/test/Transforms/InstCombine/funnel.ll | 42 +++++++++++++++++
2 files changed, 86 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b04e070fd19d7d1..09017e46ee3bd4d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2727,7 +2727,8 @@ Instruction *InstCombinerImpl::matchBSwapOrBitReverse(Instruction &I,
}
/// Match UB-safe variants of the funnel shift intrinsic.
-static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
+static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC,
+ const DominatorTree &DT) {
// TODO: Can we reduce the code duplication between this and the related
// rotate matching code under visitSelect and visitTrunc?
unsigned Width = Or.getType()->getScalarSizeInBits();
@@ -2832,6 +2833,47 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
return nullptr;
FShiftArgs = {ShVal0, ShVal1, ShAmt};
+
+ } else if (isa<ZExtInst>(Or0) || isa<ZExtInst>(Or1)) {
+ // If there are two 'or' instructions concatenating variables in opposite
+ // orders, the latter one can be safely converted to fshl.
+ //
+ // LowHigh = or (shl (zext Low), Width - ZextHighShlAmt), (zext High)
+ // HighLow = or (shl (zext High), ZextHighShlAmt), (zext Low)
+ // ->
+ // HighLow = fshl LowHigh, LowHigh, ZextHighShlAmt
+ if (!isa<ZExtInst>(Or1))
+ std::swap(Or0, Or1);
+
+ Value *High, *ZextHigh, *Low;
+ const APInt *ZextHighShlAmt;
+ if (!match(Or0,
+ m_OneUse(m_Shl(m_Value(ZextHigh), m_APInt(ZextHighShlAmt)))))
+ return nullptr;
+
+ if (!match(Or1, m_ZExt(m_Value(Low))) ||
+ !match(ZextHigh, m_ZExt(m_Value(High))))
+ return nullptr;
+
+ unsigned HighSize = High->getType()->getScalarSizeInBits();
+ unsigned LowSize = Low->getType()->getScalarSizeInBits();
+ if (*ZextHighShlAmt != LowSize || HighSize + LowSize != Width)
+ return nullptr;
+
+ for (User *U : ZextHigh->users()) {
+ Value *X, *Y;
+ if (!match(U, m_Or(m_Value(X), m_Value(Y))))
+ continue;
+
+ if (!isa<ZExtInst>(Y))
+ std::swap(X, Y);
+
+ if (match(X, m_Shl(m_Specific(Or1), m_SpecificInt(HighSize))) &&
+ match(Y, m_Specific(ZextHigh)) && DT.dominates(U, &Or)) {
+ FShiftArgs = {U, U, ConstantInt::get(Or0->getType(), *ZextHighShlAmt)};
+ break;
+ }
+ }
}
if (FShiftArgs.empty())
@@ -3333,7 +3375,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
/*MatchBitReversals*/ true))
return BitOp;
- if (Instruction *Funnel = matchFunnelShift(I, *this))
+ if (Instruction *Funnel = matchFunnelShift(I, *this, DT))
return Funnel;
if (Instruction *Concat = matchOrConcat(I, Builder))
diff --git a/llvm/test/Transforms/InstCombine/funnel.ll b/llvm/test/Transforms/InstCombine/funnel.ll
index 60ce49a1635623f..c5bd1aa7b4351bc 100644
--- a/llvm/test/Transforms/InstCombine/funnel.ll
+++ b/llvm/test/Transforms/InstCombine/funnel.ll
@@ -354,6 +354,48 @@ define <2 x i64> @fshl_select_vector(<2 x i64> %x, <2 x i64> %y, <2 x i64> %sham
ret <2 x i64> %r
}
+; Convert 'or concat' to fshl if opposite 'or concat' exists.
+
+define i32 @fshl_concat(i8 %x, i24 %y, ptr %addr) {
+; CHECK-LABEL: @fshl_concat(
+; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT: [[SLX:%.*]] = shl nuw i32 [[ZEXT_X]], 24
+; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i24 [[Y:%.*]] to i32
+; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
+; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
+; CHECK-NEXT: [[YX:%.*]] = call i32 @llvm.fshl.i32(i32 [[XY]], i32 [[XY]], i32 8)
+; CHECK-NEXT: ret i32 [[YX]]
+;
+ %zext.x = zext i8 %x to i32
+ %slx = shl nuw i32 %zext.x, 24
+ %zext.y = zext i24 %y to i32
+ %xy = or i32 %zext.y, %slx
+ store i32 %xy, ptr %addr, align 4
+ %sly = shl nuw i32 %zext.y, 8
+ %yx = or i32 %zext.x, %sly
+ ret i32 %yx
+}
+
+define <2 x i32> @fshl_concat_vector(<2 x i8> %x, <2 x i24> %y, ptr %addr) {
+; CHECK-LABEL: @fshl_concat_vector(
+; CHECK-NEXT: [[ZEXT_X:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT: [[SLX:%.*]] = shl nuw <2 x i32> [[ZEXT_X]], <i32 24, i32 24>
+; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext <2 x i24> [[Y:%.*]] to <2 x i32>
+; CHECK-NEXT: [[XY:%.*]] = or <2 x i32> [[SLX]], [[ZEXT_Y]]
+; CHECK-NEXT: store <2 x i32> [[XY]], ptr [[ADDR:%.*]], align 4
+; CHECK-NEXT: [[YX:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[XY]], <2 x i32> [[XY]], <2 x i32> <i32 8, i32 8>)
+; CHECK-NEXT: ret <2 x i32> [[YX]]
+;
+ %zext.x = zext <2 x i8> %x to <2 x i32>
+ %slx = shl nuw <2 x i32> %zext.x, <i32 24, i32 24>
+ %zext.y = zext <2 x i24> %y to <2 x i32>
+ %xy = or <2 x i32> %slx, %zext.y
+ store <2 x i32> %xy, ptr %addr, align 4
+ %sly = shl nuw <2 x i32> %zext.y, <i32 8, i32 8>
+ %yx = or <2 x i32> %sly, %zext.x
+ ret <2 x i32> %yx
+}
+
; Negative test - an oversized shift in the narrow type would produce the wrong value.
define i8 @unmasked_shlop_unmasked_shift_amount(i32 %x, i32 %y, i32 %shamt) {
From b7810a14756973d0e00eadfe2607d5424b8b0a3c Mon Sep 17 00:00:00 2001
From: Haohai Wen <haohai.wen at intel.com>
Date: Fri, 20 Oct 2023 14:49:38 +0800
Subject: [PATCH 3/5] Support zero padding between high and low.
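
With this change the two concats no longer need to pack the full width;
zero padding around the fields is allowed as long as the two shift
amounts still sum to the width. A sketch based on the fshl_concat_i8_i8
test below (layouts shown MSB-first):

  ; %xy = | 11 zero bits | x | 5 zero bits  | y |   (x << 13, y at bit 0)
  ; %yx = | 5 zero bits  | y | 11 zero bits | x |   (y << 19, x at bit 0)
  %zext.x = zext i8 %x to i32
  %slx = shl i32 %zext.x, 13
  %zext.y = zext i8 %y to i32
  %xy = or i32 %zext.y, %slx
  store i32 %xy, ptr %addr, align 4
  %sly = shl i32 %zext.y, 19
  %yx = or i32 %zext.x, %sly
  ; 13 + 19 == 32, so %yx folds to
  ; call i32 @llvm.fshl.i32(i32 %xy, i32 %xy, i32 19)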
---
.../InstCombine/InstCombineAndOrXor.cpp | 36 +++++++----
llvm/test/Transforms/InstCombine/funnel.ll | 62 ++++++++++++++++++-
2 files changed, 85 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 6f0fa9bbe6e807f..2b4c5317d4b2d5f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2842,13 +2842,17 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC,
FShiftArgs = {ShVal0, ShVal1, ShAmt};
} else if (isa<ZExtInst>(Or0) || isa<ZExtInst>(Or1)) {
- // If there are two 'or' instructions concatenating variables in opposite
- // orders, the latter one can be safely converted to fshl.
+ // If there are two 'or' instructions concatenating variables in opposite orders:
//
- // LowHigh = or (shl (zext Low), Width - ZextHighShlAmt), (zext High)
+ // Slot1 and Slot2 are all zero bits.
+ // | Slot1 | Low | Slot2 | High |
+ // LowHigh = or (shl (zext Low), ZextLowShlAmt), (zext High)
+ // | Slot2 | High | Slot1 | Low |
// HighLow = or (shl (zext High), ZextHighShlAmt), (zext Low)
- // ->
- // HighLow = fshl LowHigh, LowHigh, ZextHighShlAmt
+ //
+ // the latter 'or' can be safely converted to
+ // -> HighLow = fshl LowHigh, LowHigh, ZextHighShlAmt
+ // if ZextLowShlAmt + ZextHighShlAmt == Width.
if (!isa<ZExtInst>(Or1))
std::swap(Or0, Or1);
@@ -2864,7 +2868,9 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC,
unsigned HighSize = High->getType()->getScalarSizeInBits();
unsigned LowSize = Low->getType()->getScalarSizeInBits();
- if (*ZextHighShlAmt != LowSize || HighSize + LowSize != Width)
+ // Make sure High does not overlap with Low and the most significant bits
+ // of High aren't shifted out.
+ if (ZextHighShlAmt->ult(LowSize) || ZextHighShlAmt->ugt(Width - HighSize))
return nullptr;
for (User *U : ZextHigh->users()) {
@@ -2875,11 +2881,19 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC,
if (!isa<ZExtInst>(Y))
std::swap(X, Y);
- if (match(X, m_Shl(m_Specific(Or1), m_SpecificInt(HighSize))) &&
- match(Y, m_Specific(ZextHigh)) && DT.dominates(U, &Or)) {
- FShiftArgs = {U, U, ConstantInt::get(Or0->getType(), *ZextHighShlAmt)};
- break;
- }
+ const APInt *ZextLowShlAmt;
+ if (!match(X, m_Shl(m_Specific(Or1), m_APInt(ZextLowShlAmt))) ||
+ !match(Y, m_Specific(ZextHigh)) || !DT.dominates(U, &Or))
+ continue;
+
+ // Make sure Low does not overlap with High, the most significant bits of
+ // Low aren't shifted out, and we can rotate LowHigh to HighLow.
+ if (ZextLowShlAmt->ult(HighSize) || ZextLowShlAmt->ugt(Width - LowSize) ||
+ *ZextLowShlAmt + *ZextHighShlAmt != Width)
+ continue;
+
+ FShiftArgs = {U, U, ConstantInt::get(Or0->getType(), *ZextHighShlAmt)};
+ break;
}
}
diff --git a/llvm/test/Transforms/InstCombine/funnel.ll b/llvm/test/Transforms/InstCombine/funnel.ll
index c5bd1aa7b4351bc..b7fcca4fbe15cb8 100644
--- a/llvm/test/Transforms/InstCombine/funnel.ll
+++ b/llvm/test/Transforms/InstCombine/funnel.ll
@@ -356,8 +356,8 @@ define <2 x i64> @fshl_select_vector(<2 x i64> %x, <2 x i64> %y, <2 x i64> %sham
; Convert 'or concat' to fshl if opposite 'or concat' exists.
-define i32 @fshl_concat(i8 %x, i24 %y, ptr %addr) {
-; CHECK-LABEL: @fshl_concat(
+define i32 @fshl_concat_i8_i24(i8 %x, i24 %y, ptr %addr) {
+; CHECK-LABEL: @fshl_concat_i8_i24(
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[SLX:%.*]] = shl nuw i32 [[ZEXT_X]], 24
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i24 [[Y:%.*]] to i32
@@ -376,6 +376,64 @@ define i32 @fshl_concat(i8 %x, i24 %y, ptr %addr) {
ret i32 %yx
}
+define i32 @fshl_concat_i8_i8(i8 %x, i8 %y, ptr %addr) {
+; CHECK-LABEL: @fshl_concat_i8_i8(
+; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 13
+; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
+; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
+; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
+; CHECK-NEXT: [[YX:%.*]] = call i32 @llvm.fshl.i32(i32 [[XY]], i32 [[XY]], i32 19)
+; CHECK-NEXT: ret i32 [[YX]]
+;
+ %zext.x = zext i8 %x to i32
+ %slx = shl nuw i32 %zext.x, 13
+ %zext.y = zext i8 %y to i32
+ %xy = or i32 %zext.y, %slx
+ store i32 %xy, ptr %addr, align 4
+ %sly = shl nuw i32 %zext.y, 19
+ %yx = or i32 %zext.x, %sly
+ ret i32 %yx
+}
+
+define i32 @fshl_concat_i16_i16_overlap_drop(i16 %x, i16 %y, ptr %addr) {
+; CHECK-LABEL: @fshl_concat_i16_i16_overlap_drop(
+; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i16 [[X:%.*]] to i32
+; CHECK-NEXT: [[SLX:%.*]] = shl nuw i32 [[ZEXT_X]], 17
+; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i16 [[Y:%.*]] to i32
+; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
+; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
+; CHECK-NEXT: [[SLY:%.*]] = shl nuw nsw i32 [[ZEXT_Y]], 15
+; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
+; CHECK-NEXT: ret i32 [[YX]]
+;
+ %zext.x = zext i16 %x to i32
+ %slx = shl nuw i32 %zext.x, 17
+ %zext.y = zext i16 %y to i32
+ %xy = or i32 %zext.y, %slx
+ store i32 %xy, ptr %addr, align 4
+ %sly = shl nuw i32 %zext.y, 15
+ %yx = or i32 %zext.x, %sly
+ ret i32 %yx
+}
+
+define i32 @fshl_concat_unknown_source(i32 %zext.x, i32 %zext.y, ptr %addr) {
+; CHECK-LABEL: @fshl_concat_unknown_source(
+; CHECK-NEXT: [[SLX:%.*]] = shl nuw i32 [[ZEXT_X:%.*]], 16
+; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y:%.*]]
+; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
+; CHECK-NEXT: [[SLY:%.*]] = shl nuw i32 [[ZEXT_Y]], 16
+; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
+; CHECK-NEXT: ret i32 [[YX]]
+;
+ %slx = shl nuw i32 %zext.x, 16
+ %xy = or i32 %zext.y, %slx
+ store i32 %xy, ptr %addr, align 4
+ %sly = shl nuw i32 %zext.y, 16
+ %yx = or i32 %zext.x, %sly
+ ret i32 %yx
+}
+
define <2 x i32> @fshl_concat_vector(<2 x i8> %x, <2 x i24> %y, ptr %addr) {
; CHECK-LABEL: @fshl_concat_vector(
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
From 116e529073162f90746eb186d1e433434b06772f Mon Sep 17 00:00:00 2001
From: Haohai Wen <haohai.wen at intel.com>
Date: Mon, 23 Oct 2023 16:25:39 +0800
Subject: [PATCH 4/5] Simplify code. Add more tests
---
.../InstCombine/InstCombineAndOrXor.cpp | 12 ++--
llvm/test/Transforms/InstCombine/funnel.ll | 64 ++++++++++++++++---
2 files changed, 62 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 2b4c5317d4b2d5f..05501b0b8c7f130 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2886,12 +2886,16 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC,
!match(Y, m_Specific(ZextHigh)) || !DT.dominates(U, &Or))
continue;
- // Make sure Low does not overlap with High, the most significant bits of
- // Low aren't shifted out, and we can rotate LowHigh to HighLow.
- if (ZextLowShlAmt->ult(HighSize) || ZextLowShlAmt->ugt(Width - LowSize) ||
- *ZextLowShlAmt + *ZextHighShlAmt != Width)
+ // HighLow is a good concat. If the sum of the two shift amounts equals
+ // Width, LowHigh must also be a good concat.
+ if (*ZextLowShlAmt + *ZextHighShlAmt != Width)
continue;
+ // Low must not overlap with High and the most significant bits of Low
+ // must not be shifted out.
+ assert(ZextLowShlAmt->uge(HighSize) &&
+ ZextLowShlAmt->ule(Width - LowSize) && "Invalid concat");
+
FShiftArgs = {U, U, ConstantInt::get(Or0->getType(), *ZextHighShlAmt)};
break;
}
diff --git a/llvm/test/Transforms/InstCombine/funnel.ll b/llvm/test/Transforms/InstCombine/funnel.ll
index b7fcca4fbe15cb8..c3fb164e4bc7d42 100644
--- a/llvm/test/Transforms/InstCombine/funnel.ll
+++ b/llvm/test/Transforms/InstCombine/funnel.ll
@@ -396,23 +396,67 @@ define i32 @fshl_concat_i8_i8(i8 %x, i8 %y, ptr %addr) {
ret i32 %yx
}
-define i32 @fshl_concat_i16_i16_overlap_drop(i16 %x, i16 %y, ptr %addr) {
-; CHECK-LABEL: @fshl_concat_i16_i16_overlap_drop(
-; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT: [[SLX:%.*]] = shl nuw i32 [[ZEXT_X]], 17
-; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i16 [[Y:%.*]] to i32
+define i32 @fshl_concat_i8_i8_overlap(i8 %x, i8 %y, ptr %addr) {
+; CHECK-LABEL: @fshl_concat_i8_i8_overlap(
+; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT: [[SLX:%.*]] = shl nuw i32 [[ZEXT_X]], 25
+; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
+; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
+; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
+; CHECK-NEXT: [[SLY:%.*]] = shl nuw nsw i32 [[ZEXT_Y]], 7
+; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
+; CHECK-NEXT: ret i32 [[YX]]
+;
+ ; Test sly overlap.
+ %zext.x = zext i8 %x to i32
+ %slx = shl nuw i32 %zext.x, 25
+ %zext.y = zext i8 %y to i32
+ %xy = or i32 %zext.y, %slx
+ store i32 %xy, ptr %addr, align 4
+ %sly = shl nuw i32 %zext.y, 7
+ %yx = or i32 %zext.x, %sly
+ ret i32 %yx
+}
+
+define i32 @fshl_concat_i8_i8_drop(i8 %x, i8 %y, ptr %addr) {
+; CHECK-LABEL: @fshl_concat_i8_i8_drop(
+; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 7
+; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
-; CHECK-NEXT: [[SLY:%.*]] = shl nuw nsw i32 [[ZEXT_Y]], 15
+; CHECK-NEXT: [[SLY:%.*]] = shl nuw i32 [[ZEXT_Y]], 25
; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
; CHECK-NEXT: ret i32 [[YX]]
;
- %zext.x = zext i16 %x to i32
- %slx = shl nuw i32 %zext.x, 17
- %zext.y = zext i16 %y to i32
+ ; Test sly drop.
+ %zext.x = zext i8 %x to i32
+ %slx = shl nuw i32 %zext.x, 7
+ %zext.y = zext i8 %y to i32
+ %xy = or i32 %zext.y, %slx
+ store i32 %xy, ptr %addr, align 4
+ %sly = shl nuw i32 %zext.y, 25
+ %yx = or i32 %zext.x, %sly
+ ret i32 %yx
+}
+
+define i32 @fshl_concat_i8_i8_different_slot(i8 %x, i8 %y, ptr %addr) {
+; CHECK-LABEL: @fshl_concat_i8_i8_different_slot(
+; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 9
+; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
+; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
+; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
+; CHECK-NEXT: [[SLY:%.*]] = shl nuw nsw i32 [[ZEXT_Y]], 22
+; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
+; CHECK-NEXT: ret i32 [[YX]]
+;
+ %zext.x = zext i8 %x to i32
+ %slx = shl nuw i32 %zext.x, 9
+ %zext.y = zext i8 %y to i32
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
- %sly = shl nuw i32 %zext.y, 15
+ %sly = shl nuw i32 %zext.y, 22
%yx = or i32 %zext.x, %sly
ret i32 %yx
}
From c882da87460276aa5c75103b6e237e793974a713 Mon Sep 17 00:00:00 2001
From: Haohai Wen <haohai.wen at intel.com>
Date: Tue, 24 Oct 2023 08:34:49 +0800
Subject: [PATCH 5/5] Fix nuw flag
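
The dropped 'nuw' flags matter for the overlap/drop tests: 'shl nuw'
produces poison whenever a non-zero bit is shifted out, so e.g. an 8-bit
value shifted left by 25 in an i32 may be poison and the negative tests
would be vacuous. A minimal illustration of that hazard (assumed
rationale, not stated in the patch):

  %zext.y = zext i8 %y to i32      ; %zext.y < 256
  ; bit 7 of %y lands past bit 31 and is dropped, so with 'nuw'
  ; the result is poison whenever bit 7 of %y is set:
  %sly = shl nuw i32 %zext.y, 25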
---
llvm/test/Transforms/InstCombine/funnel.ll | 36 +++++++++++-----------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/llvm/test/Transforms/InstCombine/funnel.ll b/llvm/test/Transforms/InstCombine/funnel.ll
index c3fb164e4bc7d42..dd8cb2d153fdac6 100644
--- a/llvm/test/Transforms/InstCombine/funnel.ll
+++ b/llvm/test/Transforms/InstCombine/funnel.ll
@@ -367,11 +367,11 @@ define i32 @fshl_concat_i8_i24(i8 %x, i24 %y, ptr %addr) {
; CHECK-NEXT: ret i32 [[YX]]
;
%zext.x = zext i8 %x to i32
- %slx = shl nuw i32 %zext.x, 24
+ %slx = shl i32 %zext.x, 24
%zext.y = zext i24 %y to i32
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
- %sly = shl nuw i32 %zext.y, 8
+ %sly = shl i32 %zext.y, 8
%yx = or i32 %zext.x, %sly
ret i32 %yx
}
@@ -387,11 +387,11 @@ define i32 @fshl_concat_i8_i8(i8 %x, i8 %y, ptr %addr) {
; CHECK-NEXT: ret i32 [[YX]]
;
%zext.x = zext i8 %x to i32
- %slx = shl nuw i32 %zext.x, 13
+ %slx = shl i32 %zext.x, 13
%zext.y = zext i8 %y to i32
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
- %sly = shl nuw i32 %zext.y, 19
+ %sly = shl i32 %zext.y, 19
%yx = or i32 %zext.x, %sly
ret i32 %yx
}
@@ -399,7 +399,7 @@ define i32 @fshl_concat_i8_i8(i8 %x, i8 %y, ptr %addr) {
define i32 @fshl_concat_i8_i8_overlap(i8 %x, i8 %y, ptr %addr) {
; CHECK-LABEL: @fshl_concat_i8_i8_overlap(
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT: [[SLX:%.*]] = shl nuw i32 [[ZEXT_X]], 25
+; CHECK-NEXT: [[SLX:%.*]] = shl i32 [[ZEXT_X]], 25
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
@@ -409,11 +409,11 @@ define i32 @fshl_concat_i8_i8_overlap(i8 %x, i8 %y, ptr %addr) {
;
; Test sly overlap.
%zext.x = zext i8 %x to i32
- %slx = shl nuw i32 %zext.x, 25
+ %slx = shl i32 %zext.x, 25
%zext.y = zext i8 %y to i32
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
- %sly = shl nuw i32 %zext.y, 7
+ %sly = shl i32 %zext.y, 7
%yx = or i32 %zext.x, %sly
ret i32 %yx
}
@@ -425,17 +425,17 @@ define i32 @fshl_concat_i8_i8_drop(i8 %x, i8 %y, ptr %addr) {
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
-; CHECK-NEXT: [[SLY:%.*]] = shl nuw i32 [[ZEXT_Y]], 25
+; CHECK-NEXT: [[SLY:%.*]] = shl i32 [[ZEXT_Y]], 25
; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
; CHECK-NEXT: ret i32 [[YX]]
;
; Test sly drop.
%zext.x = zext i8 %x to i32
- %slx = shl nuw i32 %zext.x, 7
+ %slx = shl i32 %zext.x, 7
%zext.y = zext i8 %y to i32
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
- %sly = shl nuw i32 %zext.y, 25
+ %sly = shl i32 %zext.y, 25
%yx = or i32 %zext.x, %sly
ret i32 %yx
}
@@ -452,28 +452,28 @@ define i32 @fshl_concat_i8_i8_different_slot(i8 %x, i8 %y, ptr %addr) {
; CHECK-NEXT: ret i32 [[YX]]
;
%zext.x = zext i8 %x to i32
- %slx = shl nuw i32 %zext.x, 9
+ %slx = shl i32 %zext.x, 9
%zext.y = zext i8 %y to i32
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
- %sly = shl nuw i32 %zext.y, 22
+ %sly = shl i32 %zext.y, 22
%yx = or i32 %zext.x, %sly
ret i32 %yx
}
define i32 @fshl_concat_unknown_source(i32 %zext.x, i32 %zext.y, ptr %addr) {
; CHECK-LABEL: @fshl_concat_unknown_source(
-; CHECK-NEXT: [[SLX:%.*]] = shl nuw i32 [[ZEXT_X:%.*]], 16
+; CHECK-NEXT: [[SLX:%.*]] = shl i32 [[ZEXT_X:%.*]], 16
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y:%.*]]
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
-; CHECK-NEXT: [[SLY:%.*]] = shl nuw i32 [[ZEXT_Y]], 16
+; CHECK-NEXT: [[SLY:%.*]] = shl i32 [[ZEXT_Y]], 16
; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
; CHECK-NEXT: ret i32 [[YX]]
;
- %slx = shl nuw i32 %zext.x, 16
+ %slx = shl i32 %zext.x, 16
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
- %sly = shl nuw i32 %zext.y, 16
+ %sly = shl i32 %zext.y, 16
%yx = or i32 %zext.x, %sly
ret i32 %yx
}
@@ -489,11 +489,11 @@ define <2 x i32> @fshl_concat_vector(<2 x i8> %x, <2 x i24> %y, ptr %addr) {
; CHECK-NEXT: ret <2 x i32> [[YX]]
;
%zext.x = zext <2 x i8> %x to <2 x i32>
- %slx = shl nuw <2 x i32> %zext.x, <i32 24, i32 24>
+ %slx = shl <2 x i32> %zext.x, <i32 24, i32 24>
%zext.y = zext <2 x i24> %y to <2 x i32>
%xy = or <2 x i32> %slx, %zext.y
store <2 x i32> %xy, ptr %addr, align 4
- %sly = shl nuw <2 x i32> %zext.y, <i32 8, i32 8>
+ %sly = shl <2 x i32> %zext.y, <i32 8, i32 8>
%yx = or <2 x i32> %sly, %zext.x
ret <2 x i32> %yx
}