[libcxx-commits] [libcxx] [flang] [libc] [clang-tools-extra] [clang] [llvm] [compiler-rt] [libunwind] [lld] [lldb] [X86] Use RORX over SHR imm (PR #77964)

Sun Jan 28 08:55:03 PST 2024

================
@@ -4216,6 +4217,95 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
   return CNode;
 }
 
+// When the consumer of a right shift (arithmetic or logical) wouldn't notice
+// the difference if the instruction was a rotate right instead (because the
+// bits shifted in are truncated away), the shift can be replaced by the RORX
+// instruction from BMI2. This doesn't set flags and can output to a different
+// register. However, this increases code size in most cases, and doesn't leave
+// the high bits in a useful state. There may be other situations where this
+// transformation is profitable given those conditions, but currently the
+// transformation is only made when it likely avoids spilling flags.
+bool X86DAGToDAGISel::rightShiftUnclobberFlags(SDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  // Target has to have BMI2 for RORX
+  if (!Subtarget->hasBMI2())
+    return false;
+
+  // Only handle scalar shifts.
+  if (VT.isVector())
+    return false;
+
+  unsigned OpSize;
+  if (VT == MVT::i64)
+    OpSize = 64;
+  else if (VT == MVT::i32)
+    OpSize = 32;
+  else if (VT == MVT::i16)
+    OpSize = 16;
+  else if (VT == MVT::i8)
+    return false; // i8 shift can't be truncated.
+  else
+    llvm_unreachable("Unexpected shift size");
+
+  unsigned TruncateSize = 0;
+  // This only works when the result is truncated.
+  for (const SDNode *User : N->uses()) {
+    if (!User->isMachineOpcode() ||
+        User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
+      return false;
+    EVT TuncateType = User->getValueType(0);
+    if (TuncateType == MVT::i32)
+      TruncateSize = std::max(TruncateSize, 32U);
+    else if (TuncateType == MVT::i16)
+      TruncateSize = std::max(TruncateSize, 16U);
+    else if (TuncateType == MVT::i8)
+      TruncateSize = std::max(TruncateSize, 8U);
+    else
+      return false;
+  }
+  if (TruncateSize >= OpSize)
+    return false;
+
+  // The shift must be by an immediate that wouldn't expose the zero or sign
+  // extended result.
+  auto *ShiftAmount = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!ShiftAmount || ShiftAmount->getZExtValue() > OpSize - TruncateSize)
+    return false;
+
+  // If the shift argument has non-dead EFLAGS, then this shift probably
+  // clobbers those flags making the transformation to RORX useful. This may
+  // have false negatives or positives so ideally this transformation is made
+  // later on.
+  bool ArgProducesFlags = false;
+  SDNode *Input = N->getOperand(0).getNode();
+  for (auto Use : Input->uses()) {
+    if (Use->getOpcode() == ISD::CopyToReg) {
+      auto *RegisterNode =
+          dyn_cast<RegisterSDNode>(Use->getOperand(1).getNode());
----------------
RKSimon wrote:

```dyn_cast<RegisterSDNode>(Use->getOperand(1))```

https://github.com/llvm/llvm-project/pull/77964