[llvm] goldsteinn/recommit sh ro cmp (PR #71729)

via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 8 12:01:51 PST 2023


https://github.com/goldsteinn created https://github.com/llvm/llvm-project/pull/71729

- [X86] Add more tests for transform `(icmp eq/ne (and X,C0),(shift X,C1))`; PR71598
- Recommit "[DAGCombiner] Transform `(icmp eq/ne (and X,C0),(shift X,C1))` to use rotate or to get better constants." (2nd Try)
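
For context, here is a minimal standalone C++ sketch (not part of the patch; the helper name and exact form are illustrative only) of the condition under which `(x & C0) ==/!= (x >> C1)` compares all of the bits of `x`. That condition is what lets the combine rewrite the compare as a rotate (when C1 is a power of two) or flip the shift direction to get better constants. It mirrors the CanTransform checks added to visitSETCC in the second patch:

  #include <bit>
  #include <cstdint>

  // Sketch of the srl/and case of the CanTransform checks in visitSETCC,
  // specialized to a 32-bit scalar; the shl case uses the complemented
  // (high-bit) mask instead.
  bool canRewriteMaskShiftCmp(uint32_t C0, unsigned C1) {
    constexpr unsigned NumBits = 32;
    if (C1 >= NumBits)
      return false;
    // The shift amount must equal the number of bits cleared by the mask.
    bool Ok = C1 == static_cast<unsigned>(std::popcount(~C0));
    // Mask bits plus shift amount must cover the whole value (the check this
    // recommit adds).
    Ok &= C1 + static_cast<unsigned>(std::popcount(C0)) == NumBits;
    // C0 must be a low-bit mask (2^k - 1), matching AndCMask->isMask().
    Ok &= (C0 & (C0 + 1)) == 0;
    return Ok;
  }

With that condition, C0 = 0x0FFFFFFF, C1 = 4 is accepted (the shr_to_rotate_eq_i32_s5 test below indeed becomes a rol/rorx compare), while C0 = 7, C1 = 5 is rejected since 3 + 5 != 32 (the shr_to_shl_eq_i32_s5_fail_doesnt_add_up test stays as and+shr+cmp).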


From d157922f35146c6a4cdb2599330dad1e257a1942 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Wed, 8 Nov 2023 12:45:53 -0600
Subject: [PATCH 1/2] [X86] Add more tests for transform `(icmp eq/ne (and
 X,C0),(shift X,C1))`; PR71598

---
 llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll | 225 ++++++++++++++++++++++
 1 file changed, 225 insertions(+)

diff --git a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll
index 8ec142acb71d4ce..063f8c606624dde 100644
--- a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll
+++ b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll
@@ -570,6 +570,231 @@ define <16 x i1> @shl_to_ror_eq_16xi16_s8_fail_preserve_i16(<16 x i16> %x) {
   ret <16 x i1> %r
 }
 
+define i1 @shr_to_shl_eq_i32_s5_fail_doesnt_add_up(i32 %x) {
+; CHECK-LABEL: shr_to_shl_eq_i32_s5_fail_doesnt_add_up:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $7, %eax
+; CHECK-NEXT:    shrl $5, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, 7
+  %sh = lshr i32 %x, 5
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shr_to_shl_eq_i8_s5_fail_doesnt_add_up2(i32 %x) {
+; CHECK-LABEL: shr_to_shl_eq_i8_s5_fail_doesnt_add_up2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $268435455, %eax # imm = 0xFFFFFFF
+; CHECK-NEXT:    shrl $5, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, 268435455
+  %sh = lshr i32 %x, 5
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shr_to_shl_eq_i8_s5_fail_doesnt_add_up3(i32 %x) {
+; CHECK-LABEL: shr_to_shl_eq_i8_s5_fail_doesnt_add_up3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $67108863, %eax # imm = 0x3FFFFFF
+; CHECK-NEXT:    shrl $5, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, 67108863
+  %sh = lshr i32 %x, 5
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shr_to_shl_eq_i8_s5_fail_doesnt_not_mask(i32 %x) {
+; CHECK-LABEL: shr_to_shl_eq_i8_s5_fail_doesnt_not_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $11, %eax
+; CHECK-NEXT:    shrl $5, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, 11
+  %sh = lshr i32 %x, 5
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shl_to_shr_eq_i32_s9_fail_wrong_mask(i32 %x) {
+; CHECK-LABEL: shl_to_shr_eq_i32_s9_fail_wrong_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $511, %eax # imm = 0x1FF
+; CHECK-NEXT:    shll $9, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, 511
+  %sh = shl i32 %x, 9
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shr_to_shl_eq_i32_s5_fail_wrong_mask(i32 %x) {
+; CHECK-LABEL: shr_to_shl_eq_i32_s5_fail_wrong_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $-32, %eax
+; CHECK-NEXT:    shrl $5, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, -32
+  %sh = lshr i32 %x, 5
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shl_to_shr_eq_i32_s9_fail_doesnt_add_up(i32 %x) {
+; CHECK-LABEL: shl_to_shr_eq_i32_s9_fail_doesnt_add_up:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $65024, %eax # imm = 0xFE00
+; CHECK-NEXT:    shll $9, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, 65024
+  %sh = shl i32 %x, 9
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shl_to_shr_eq_i32_s9_fail_doesnt_add_up2(i32 %x) {
+; CHECK-LABEL: shl_to_shr_eq_i32_s9_fail_doesnt_add_up2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $-1024, %eax # imm = 0xFC00
+; CHECK-NEXT:    shll $9, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, -1024
+  %sh = shl i32 %x, 9
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shl_to_shr_eq_i32_s9_fail_doesnt_add_up3(i32 %x) {
+; CHECK-LABEL: shl_to_shr_eq_i32_s9_fail_doesnt_add_up3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $-256, %eax
+; CHECK-NEXT:    shll $9, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, -256
+  %sh = shl i32 %x, 9
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shl_to_shr_eq_i32_s9_fail_not_mask(i32 %x) {
+; CHECK-LABEL: shl_to_shr_eq_i32_s9_fail_not_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $-511, %eax # imm = 0xFE01
+; CHECK-NEXT:    shll $9, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, -511
+  %sh = shl i32 %x, 9
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shl_to_shr_eq_i32_s9_fail_not_mask2(i32 %x) {
+; CHECK-LABEL: shl_to_shr_eq_i32_s9_fail_not_mask2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $-255, %eax
+; CHECK-NEXT:    shll $9, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, -255
+  %sh = shl i32 %x, 9
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shl_to_shr_eq_i32_s9_fail_wrong_mask2(i32 %x) {
+; CHECK-LABEL: shl_to_shr_eq_i32_s9_fail_wrong_mask2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $8388607, %eax # imm = 0x7FFFFF
+; CHECK-NEXT:    shll $9, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, 8388607
+  %sh = shl i32 %x, 9
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shl_to_shr_eq_i32_s9(i32 %x) {
+; CHECK-LABEL: shl_to_shr_eq_i32_s9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $-512, %eax # imm = 0xFE00
+; CHECK-NEXT:    shll $9, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, -512
+  %sh = shl i32 %x, 9
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shr_to_shl_eq_i32_s5(i32 %x) {
+; CHECK-LABEL: shr_to_shl_eq_i32_s5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $134217727, %eax # imm = 0x7FFFFFF
+; CHECK-NEXT:    shrl $5, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, 134217727
+  %sh = lshr i32 %x, 5
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
+define i1 @shr_to_rotate_eq_i32_s5(i32 %x) {
+; CHECK-LABEL: shr_to_rotate_eq_i32_s5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $268435455, %eax # imm = 0xFFFFFFF
+; CHECK-NEXT:    shrl $4, %edi
+; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %and = and i32 %x, 268435455
+  %sh = lshr i32 %x, 4
+  %r = icmp eq i32 %and, %sh
+  ret i1 %r
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-AVX: {{.*}}
 ; CHECK-NOBMI-SSE2: {{.*}}

From d3bdaacec65fe65ea127f9fc163df8dcaaef1e06 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Wed, 8 Nov 2023 12:35:58 -0600
Subject: [PATCH 2/2] Recommit "[DAGCombiner] Transform `(icmp eq/ne (and
 X,C0),(shift X,C1))` to use rotate or to get better constants." (2nd Try)

Added the missing check that the mask and shift amount add up to the correct
bitwidth, as well as test cases for the bug.
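
For example (taken from the new tests), in shr_to_shl_eq_i32_s5_fail_doesnt_add_up:

  %and = and i32 %x, 7
  %sh = lshr i32 %x, 5
  %r = icmp eq i32 %and, %sh

popcount(7) + 5 = 8 rather than 32, so the compare does not cover all bits of
%x and the fold must not fire; the added
`(*ShiftCAmt + AndCMask->popcount()) == NumBits` check rejects it, and the
CHECK lines keep the plain and+shr+cmp sequence.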
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  18 ++
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 135 +++++++++++--
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  67 +++++++
 llvm/lib/Target/X86/X86ISelLowering.h         |   5 +
 llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll     | 180 +++++++++---------
 5 files changed, 305 insertions(+), 100 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 58aad70c4bb36e6..c87537291e3b161 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -832,6 +832,24 @@ class TargetLoweringBase {
     return N->getOpcode() == ISD::FDIV;
   }
 
+  // Given:
+  //    (icmp eq/ne (and X, C0), (shift X, C1))
+  // or
+  //    (icmp eq/ne X, (rotate X, CPow2))
+  //
+  // If C0 is a mask or shifted mask and the shift amount (C1) isolates the
+  // remaining bits (i.e. something like `(x64 & UINT32_MAX) == (x64 >> 32)`),
+  // return whether we prefer the shift to be shift-right, shift-left, or
+  // rotate. Note: It is only valid to convert the rotate version to the shift
+  // version iff the shift amount (`C1`) is a power of 2 (including 0).
+  // If ShiftOpc (the current opcode) is returned, do nothing.
+  virtual unsigned preferedOpcodeForCmpEqPiecesOfOperand(
+      EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
+      const APInt &ShiftOrRotateAmt,
+      const std::optional<APInt> &AndMask) const {
+    return ShiftOpc;
+  }
+
   /// These two forms are equivalent:
   ///   sub %y, (xor %x, -1)
   ///   add (add %x, 1), %y
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a867d88f76c0cf6..3fe7be2821e4b15 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12456,27 +12456,132 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) {
 
   ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
   EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
 
-  SDValue Combined = SimplifySetCC(VT, N->getOperand(0), N->getOperand(1), Cond,
-                                   SDLoc(N), !PreferSetCC);
-
-  if (!Combined)
-    return SDValue();
+  SDValue Combined = SimplifySetCC(VT, N0, N1, Cond, SDLoc(N), !PreferSetCC);
 
-  // If we prefer to have a setcc, and we don't, we'll try our best to
-  // recreate one using rebuildSetCC.
-  if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
-    SDValue NewSetCC = rebuildSetCC(Combined);
+  if (Combined) {
+    // If we prefer to have a setcc, and we don't, we'll try our best to
+    // recreate one using rebuildSetCC.
+    if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
+      SDValue NewSetCC = rebuildSetCC(Combined);
 
-    // We don't have anything interesting to combine to.
-    if (NewSetCC.getNode() == N)
-      return SDValue();
+      // We don't have anything interesting to combine to.
+      if (NewSetCC.getNode() == N)
+        return SDValue();
 
-    if (NewSetCC)
-      return NewSetCC;
+      if (NewSetCC)
+        return NewSetCC;
+    }
+    return Combined;
   }
 
-  return Combined;
+  // Optimize
+  //    1) (icmp eq/ne (and X, C0), (shift X, C1))
+  // or
+  //    2) (icmp eq/ne X, (rotate X, C1))
+  // If C0 is a mask or shifted mask and the shift amount (C1) isolates the
+  // remaining bits (i.e. something like `(x64 & UINT32_MAX) == (x64 >> 32)`),
+  // then:
+  // If C1 is a power of 2, the rotate and shift+and versions are equivalent,
+  // so we can interchange them depending on target preference.
+  // Otherwise, if we have the shift+and version, we can interchange srl/shl,
+  // which in turn affects the constant C0. We can use this to get better
+  // constants, again determined by target preference.
+  if (Cond == ISD::SETNE || Cond == ISD::SETEQ) {
+    auto IsAndWithShift = [](SDValue A, SDValue B) {
+      return A.getOpcode() == ISD::AND &&
+             (B.getOpcode() == ISD::SRL || B.getOpcode() == ISD::SHL) &&
+             A.getOperand(0) == B.getOperand(0);
+    };
+    auto IsRotateWithOp = [](SDValue A, SDValue B) {
+      return (B.getOpcode() == ISD::ROTL || B.getOpcode() == ISD::ROTR) &&
+             B.getOperand(0) == A;
+    };
+    SDValue AndOrOp = SDValue(), ShiftOrRotate = SDValue();
+    bool IsRotate = false;
+
+    // Find either shift+and or rotate pattern.
+    if (IsAndWithShift(N0, N1)) {
+      AndOrOp = N0;
+      ShiftOrRotate = N1;
+    } else if (IsAndWithShift(N1, N0)) {
+      AndOrOp = N1;
+      ShiftOrRotate = N0;
+    } else if (IsRotateWithOp(N0, N1)) {
+      IsRotate = true;
+      AndOrOp = N0;
+      ShiftOrRotate = N1;
+    } else if (IsRotateWithOp(N1, N0)) {
+      IsRotate = true;
+      AndOrOp = N1;
+      ShiftOrRotate = N0;
+    }
+
+    if (AndOrOp && ShiftOrRotate && ShiftOrRotate.hasOneUse() &&
+        (IsRotate || AndOrOp.hasOneUse())) {
+      EVT OpVT = N0.getValueType();
+      // Get the constant shift/rotate amount and possibly the mask (if it's
+      // the shift+and variant).
+      auto GetAPIntValue = [](SDValue Op) -> std::optional<APInt> {
+        ConstantSDNode *CNode = isConstOrConstSplat(Op, /*AllowUndefs*/ false,
+                                                    /*AllowTrunc*/ false);
+        if (CNode == nullptr)
+          return std::nullopt;
+        return CNode->getAPIntValue();
+      };
+      std::optional<APInt> AndCMask =
+          IsRotate ? std::nullopt : GetAPIntValue(AndOrOp.getOperand(1));
+      std::optional<APInt> ShiftCAmt =
+          GetAPIntValue(ShiftOrRotate.getOperand(1));
+      unsigned NumBits = OpVT.getScalarSizeInBits();
+
+      // We found constants.
+      if (ShiftCAmt && (IsRotate || AndCMask) && ShiftCAmt->ult(NumBits)) {
+        unsigned ShiftOpc = ShiftOrRotate.getOpcode();
+        // Check that the constants meet the constraints.
+        bool CanTransform = IsRotate;
+        if (!CanTransform) {
+          // Check that the mask and shift complement each other.
+          CanTransform = *ShiftCAmt == (~*AndCMask).popcount();
+          // Check that we are comparing all bits.
+          CanTransform &= (*ShiftCAmt + AndCMask->popcount()) == NumBits;
+          // Check that the and mask is correct for the shift.
+          CanTransform &=
+              ShiftOpc == ISD::SHL ? (~*AndCMask).isMask() : AndCMask->isMask();
+        }
+
+        // See if target prefers another shift/rotate opcode.
+        unsigned NewShiftOpc = TLI.preferedOpcodeForCmpEqPiecesOfOperand(
+            OpVT, ShiftOpc, ShiftCAmt->isPowerOf2(), *ShiftCAmt, AndCMask);
+        // Transform is valid and we have a new preference.
+        if (CanTransform && NewShiftOpc != ShiftOpc) {
+          SDLoc DL(N);
+          SDValue NewShiftOrRotate =
+              DAG.getNode(NewShiftOpc, DL, OpVT, ShiftOrRotate.getOperand(0),
+                          ShiftOrRotate.getOperand(1));
+          SDValue NewAndOrOp = SDValue();
+
+          if (NewShiftOpc == ISD::SHL || NewShiftOpc == ISD::SRL) {
+            APInt NewMask =
+                NewShiftOpc == ISD::SHL
+                    ? APInt::getHighBitsSet(NumBits,
+                                            NumBits - ShiftCAmt->getZExtValue())
+                    : APInt::getLowBitsSet(NumBits,
+                                           NumBits - ShiftCAmt->getZExtValue());
+            NewAndOrOp =
+                DAG.getNode(ISD::AND, DL, OpVT, ShiftOrRotate.getOperand(0),
+                            DAG.getConstant(NewMask, DL, OpVT));
+          } else {
+            NewAndOrOp = ShiftOrRotate.getOperand(0);
+          }
+
+          return DAG.getSetCC(DL, VT, NewAndOrOp, NewShiftOrRotate, Cond);
+        }
+      }
+    }
+  }
+  return SDValue();
 }
 
 SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e6045a4de51ebe2..8002aa0141910c3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3257,6 +3257,73 @@ bool X86TargetLowering::
   return NewShiftOpcode == ISD::SHL;
 }
 
+unsigned X86TargetLowering::preferedOpcodeForCmpEqPiecesOfOperand(
+    EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
+    const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
+  if (!VT.isInteger())
+    return ShiftOpc;
+
+  bool PreferRotate = false;
+  if (VT.isVector()) {
+    // For vectors, if we have rotate instruction support, then it's definitely
+    // best. Otherwise it's not clear what's best, so just don't make changes.
+    PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
+                                             VT.getScalarType() == MVT::i64);
+  } else {
+    // For scalar, if we have BMI2, prefer rotate for rorx. Otherwise prefer
+    // rotate unless we have a zext mask+shr.
+    PreferRotate = Subtarget.hasBMI2();
+    if (!PreferRotate) {
+      unsigned MaskBits =
+          VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
+      PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
+    }
+  }
+
+  if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
+    assert(AndMask.has_value() && "Null andmask when querying about shift+and");
+
+    if (PreferRotate && MayTransformRotate)
+      return ISD::ROTL;
+
+    // If vector, we don't really get much benefit from swapping around
+    // constants. Maybe in the future we could check if the DAG already has
+    // the flipped node.
+    if (VT.isVector())
+      return ShiftOpc;
+
+    // See if it is beneficial to swap the shift type.
+    if (ShiftOpc == ISD::SHL) {
+      // If the current setup has an imm64 mask, then the inverse will have at
+      // least an imm32 mask (or be a zext i32 -> i64).
+      if (VT == MVT::i64)
+        return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
+                                                  : ShiftOpc;
+
+      // We can only benefit if the mask requires at least 7 bits. We don't
+      // want to replace shl of 1, 2, or 3, as they can be implemented with
+      // lea/add.
+      return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
+    }
+
+    if (VT == MVT::i64)
+      // Keep an exactly 32-bit imm64; this is a zext i32 -> i64, which is
+      // extremely efficient.
+      return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
+
+    // Keep small shifts as shl so we can generate add/lea.
+    return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
+  }
+
+  // We prefer rotate for vectors or if we won't get a zext mask with SRL
+  // (PreferRotate will be set in the latter case).
+  if (PreferRotate || VT.isVector())
+    return ShiftOpc;
+
+  // Non-vector type and we have a zext mask with SRL.
+  return ISD::SRL;
+}
+
 bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
   return N->getOpcode() != ISD::FP_EXTEND;
 }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 8046f42736951cd..3b1b2603fd8fc61 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1138,6 +1138,11 @@ namespace llvm {
         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
         SelectionDAG &DAG) const override;
 
+    unsigned preferedOpcodeForCmpEqPiecesOfOperand(
+        EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
+        const APInt &ShiftOrRotateAmt,
+        const std::optional<APInt> &AndMask) const override;
+
     bool preferScalarizeSplat(SDNode *N) const override;
 
     bool shouldFoldConstantShiftPairToMask(const SDNode *N,
diff --git a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll
index 063f8c606624dde..7996454a0158eac 100644
--- a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll
+++ b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll
@@ -20,9 +20,8 @@ define i1 @shr_to_shl_eq_i8_s2(i8 %x) {
 ; CHECK-LABEL: shr_to_shl_eq_i8_s2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andb $63, %al
-; CHECK-NEXT:    shrb $2, %dil
-; CHECK-NEXT:    cmpb %dil, %al
+; CHECK-NEXT:    rolb $2, %al
+; CHECK-NEXT:    cmpb %al, %dil
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %and = and i8 %x, 63
@@ -35,9 +34,9 @@ define i1 @shl_to_shr_ne_i8_s7(i8 %x) {
 ; CHECK-LABEL: shl_to_shr_ne_i8_s7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shlb $7, %al
-; CHECK-NEXT:    andb $-128, %dil
-; CHECK-NEXT:    cmpb %dil, %al
+; CHECK-NEXT:    shrb $7, %al
+; CHECK-NEXT:    andb $1, %dil
+; CHECK-NEXT:    cmpb %al, %dil
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    retq
   %shl = shl i8 %x, 7
@@ -63,9 +62,8 @@ define i1 @shr_to_shl_eq_i8_s1(i8 %x) {
 ; CHECK-LABEL: shr_to_shl_eq_i8_s1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andb $127, %al
-; CHECK-NEXT:    shrb %dil
-; CHECK-NEXT:    cmpb %dil, %al
+; CHECK-NEXT:    rolb %al
+; CHECK-NEXT:    cmpb %al, %dil
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %and = and i8 %x, 127
@@ -77,10 +75,10 @@ define i1 @shr_to_shl_eq_i8_s1(i8 %x) {
 define i1 @shr_to_shl_eq_i32_s3(i32 %x) {
 ; CHECK-LABEL: shr_to_shl_eq_i32_s3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl $536870911, %eax # imm = 0x1FFFFFFF
-; CHECK-NEXT:    shrl $3, %edi
-; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal (,%rdi,8), %eax
+; CHECK-NEXT:    andl $-8, %edi
+; CHECK-NEXT:    cmpl %eax, %edi
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %and = and i32 %x, 536870911
@@ -105,14 +103,20 @@ define i1 @shl_to_shr_eq_i32_s3_fail(i32 %x) {
 }
 
 define i1 @shl_to_shr_ne_i32_s16(i32 %x) {
-; CHECK-LABEL: shl_to_shr_ne_i32_s16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll $16, %eax
-; CHECK-NEXT:    andl $-65536, %edi # imm = 0xFFFF0000
-; CHECK-NEXT:    cmpl %edi, %eax
-; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    retq
+; CHECK-NOBMI-LABEL: shl_to_shr_ne_i32_s16:
+; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movzwl %di, %eax
+; CHECK-NOBMI-NEXT:    shrl $16, %edi
+; CHECK-NOBMI-NEXT:    cmpl %edi, %eax
+; CHECK-NOBMI-NEXT:    setne %al
+; CHECK-NOBMI-NEXT:    retq
+;
+; CHECK-BMI2-LABEL: shl_to_shr_ne_i32_s16:
+; CHECK-BMI2:       # %bb.0:
+; CHECK-BMI2-NEXT:    rorxl $16, %edi, %eax
+; CHECK-BMI2-NEXT:    cmpl %eax, %edi
+; CHECK-BMI2-NEXT:    setne %al
+; CHECK-BMI2-NEXT:    retq
   %shl = shl i32 %x, 16
   %and = and i32 %x, 4294901760
   %r = icmp ne i32 %shl, %and
@@ -137,9 +141,8 @@ define i1 @shl_to_shr_ne_i32_s16_fail(i32 %x) {
 define i1 @shr_to_shl_eq_i16_s1(i16 %x) {
 ; CHECK-LABEL: shr_to_shl_eq_i16_s1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzwl %di, %eax
-; CHECK-NEXT:    andl $32767, %edi # imm = 0x7FFF
-; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    rolw %ax
 ; CHECK-NEXT:    cmpw %ax, %di
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
@@ -167,9 +170,9 @@ define i1 @shr_to_shl_eq_i16_s1_fail(i16 %x) {
 define i1 @shl_to_shr_eq_i64_s44(i64 %x) {
 ; CHECK-LABEL: shl_to_shr_eq_i64_s44:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movabsq $-17592186044416, %rax # imm = 0xFFFFF00000000000
-; CHECK-NEXT:    andq %rdi, %rax
-; CHECK-NEXT:    shlq $44, %rdi
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shrq $44, %rax
+; CHECK-NEXT:    andl $1048575, %edi # imm = 0xFFFFF
 ; CHECK-NEXT:    cmpq %rax, %rdi
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
@@ -180,13 +183,20 @@ define i1 @shl_to_shr_eq_i64_s44(i64 %x) {
 }
 
 define i1 @shr_to_shl_ne_i64_s32(i64 %x) {
-; CHECK-LABEL: shr_to_shl_ne_i64_s32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shrq $32, %rdi
-; CHECK-NEXT:    cmpq %rdi, %rax
-; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    retq
+; CHECK-NOBMI-LABEL: shr_to_shl_ne_i64_s32:
+; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    shrq $32, %rdi
+; CHECK-NOBMI-NEXT:    cmpq %rdi, %rax
+; CHECK-NOBMI-NEXT:    setne %al
+; CHECK-NOBMI-NEXT:    retq
+;
+; CHECK-BMI2-LABEL: shr_to_shl_ne_i64_s32:
+; CHECK-BMI2:       # %bb.0:
+; CHECK-BMI2-NEXT:    rorxq $32, %rdi, %rax
+; CHECK-BMI2-NEXT:    cmpq %rax, %rdi
+; CHECK-BMI2-NEXT:    setne %al
+; CHECK-BMI2-NEXT:    retq
   %and = and i64 %x, 4294967295
   %shr = lshr i64 %x, 32
   %r = icmp ne i64 %and, %shr
@@ -230,9 +240,9 @@ define i1 @ashr_to_shl_ne_i64_s32_fail(i64 %x) {
 define i1 @shl_to_shr_eq_i64_s63(i64 %x) {
 ; CHECK-LABEL: shl_to_shr_eq_i64_s63:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT:    andq %rdi, %rax
-; CHECK-NEXT:    shlq $63, %rdi
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shrq $63, %rax
+; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    cmpq %rax, %rdi
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
@@ -258,23 +268,14 @@ define i1 @shl_to_shr_eq_i64_s63_fail(i64 %x) {
 }
 
 define i1 @shr_to_shl_eq_i64_s7(i64 %x) {
-; CHECK-NOBMI-LABEL: shr_to_shl_eq_i64_s7:
-; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movabsq $144115188075855871, %rax # imm = 0x1FFFFFFFFFFFFFF
-; CHECK-NOBMI-NEXT:    andq %rdi, %rax
-; CHECK-NOBMI-NEXT:    shrq $7, %rdi
-; CHECK-NOBMI-NEXT:    cmpq %rdi, %rax
-; CHECK-NOBMI-NEXT:    sete %al
-; CHECK-NOBMI-NEXT:    retq
-;
-; CHECK-BMI2-LABEL: shr_to_shl_eq_i64_s7:
-; CHECK-BMI2:       # %bb.0:
-; CHECK-BMI2-NEXT:    movb $57, %al
-; CHECK-BMI2-NEXT:    bzhiq %rax, %rdi, %rax
-; CHECK-BMI2-NEXT:    shrq $7, %rdi
-; CHECK-BMI2-NEXT:    cmpq %rdi, %rax
-; CHECK-BMI2-NEXT:    sete %al
-; CHECK-BMI2-NEXT:    retq
+; CHECK-LABEL: shr_to_shl_eq_i64_s7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shlq $7, %rax
+; CHECK-NEXT:    andq $-128, %rdi
+; CHECK-NEXT:    cmpq %rax, %rdi
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
   %and = and i64 %x, 144115188075855871
   %shr = lshr i64 %x, 7
   %r = icmp eq i64 %and, %shr
@@ -284,9 +285,8 @@ define i1 @shr_to_shl_eq_i64_s7(i64 %x) {
 define i1 @shl_to_shr_ne_i32_s24(i32 %x) {
 ; CHECK-LABEL: shl_to_shr_ne_i32_s24:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll $24, %eax
-; CHECK-NEXT:    andl $-16777216, %edi # imm = 0xFF000000
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    shrl $24, %edi
 ; CHECK-NEXT:    cmpl %edi, %eax
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    retq
@@ -312,14 +312,20 @@ define i1 @shr_to_shl_ne_i32_s24_fail(i32 %x) {
 }
 
 define i1 @shr_to_shl_ne_i32_s8(i32 %x) {
-; CHECK-LABEL: shr_to_shl_ne_i32_s8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl $16777215, %eax # imm = 0xFFFFFF
-; CHECK-NEXT:    shrl $8, %edi
-; CHECK-NEXT:    cmpl %edi, %eax
-; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    retq
+; CHECK-NOBMI-LABEL: shr_to_shl_ne_i32_s8:
+; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    roll $8, %eax
+; CHECK-NOBMI-NEXT:    cmpl %eax, %edi
+; CHECK-NOBMI-NEXT:    setne %al
+; CHECK-NOBMI-NEXT:    retq
+;
+; CHECK-BMI2-LABEL: shr_to_shl_ne_i32_s8:
+; CHECK-BMI2:       # %bb.0:
+; CHECK-BMI2-NEXT:    rorxl $24, %edi, %eax
+; CHECK-BMI2-NEXT:    cmpl %eax, %edi
+; CHECK-BMI2-NEXT:    setne %al
+; CHECK-BMI2-NEXT:    retq
   %and = and i32 %x, 16777215
   %shr = lshr i32 %x, 8
   %r = icmp ne i32 %and, %shr
@@ -359,9 +365,8 @@ define <4 x i1> @shr_to_ror_eq_4xi32_s4(<4 x i32> %x) {
 ;
 ; CHECK-AVX512-LABEL: shr_to_ror_eq_4xi32_s4:
 ; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    vpsrld $4, %xmm0, %xmm1
-; CHECK-AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-AVX512-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; CHECK-AVX512-NEXT:    vprold $4, %xmm0, %xmm1
+; CHECK-AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; CHECK-AVX512-NEXT:    retq
   %shr = lshr <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
@@ -402,9 +407,8 @@ define <4 x i1> @shl_to_ror_eq_4xi32_s8(<4 x i32> %x) {
 ;
 ; CHECK-AVX512-LABEL: shl_to_ror_eq_4xi32_s8:
 ; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    vpslld $8, %xmm0, %xmm1
-; CHECK-AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-AVX512-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; CHECK-AVX512-NEXT:    vprold $8, %xmm0, %xmm1
+; CHECK-AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; CHECK-AVX512-NEXT:    retq
   %shr = shl <4 x i32> %x, <i32 8, i32 8, i32 8, i32 8>
@@ -754,9 +758,9 @@ define i1 @shl_to_shr_eq_i32_s9(i32 %x) {
 ; CHECK-LABEL: shl_to_shr_eq_i32_s9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl $-512, %eax # imm = 0xFE00
-; CHECK-NEXT:    shll $9, %edi
-; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    shrl $9, %eax
+; CHECK-NEXT:    andl $8388607, %edi # imm = 0x7FFFFF
+; CHECK-NEXT:    cmpl %eax, %edi
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %and = and i32 %x, -512
@@ -769,9 +773,9 @@ define i1 @shr_to_shl_eq_i32_s5(i32 %x) {
 ; CHECK-LABEL: shr_to_shl_eq_i32_s5:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl $134217727, %eax # imm = 0x7FFFFFF
-; CHECK-NEXT:    shrl $5, %edi
-; CHECK-NEXT:    cmpl %edi, %eax
+; CHECK-NEXT:    shll $5, %eax
+; CHECK-NEXT:    andl $-32, %edi
+; CHECK-NEXT:    cmpl %eax, %edi
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %and = and i32 %x, 134217727
@@ -781,14 +785,20 @@ define i1 @shr_to_shl_eq_i32_s5(i32 %x) {
 }
 
 define i1 @shr_to_rotate_eq_i32_s5(i32 %x) {
-; CHECK-LABEL: shr_to_rotate_eq_i32_s5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl $268435455, %eax # imm = 0xFFFFFFF
-; CHECK-NEXT:    shrl $4, %edi
-; CHECK-NEXT:    cmpl %edi, %eax
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; CHECK-NOBMI-LABEL: shr_to_rotate_eq_i32_s5:
+; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    roll $4, %eax
+; CHECK-NOBMI-NEXT:    cmpl %eax, %edi
+; CHECK-NOBMI-NEXT:    sete %al
+; CHECK-NOBMI-NEXT:    retq
+;
+; CHECK-BMI2-LABEL: shr_to_rotate_eq_i32_s5:
+; CHECK-BMI2:       # %bb.0:
+; CHECK-BMI2-NEXT:    rorxl $28, %edi, %eax
+; CHECK-BMI2-NEXT:    cmpl %eax, %edi
+; CHECK-BMI2-NEXT:    sete %al
+; CHECK-BMI2-NEXT:    retq
   %and = and i32 %x, 268435455
   %sh = lshr i32 %x, 4
   %r = icmp eq i32 %and, %sh


