[llvm] [RISCV] Add DAG combine to turn (sub (shl X, 8-Y), (shr X, Y)) into orc.b (PR #111828)

Daniel Mokeev via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 10 09:12:17 PDT 2024


https://github.com/damokeev updated https://github.com/llvm/llvm-project/pull/111828

>From 78eb2d8d28ed2bf679ef4f25e40237d4d5ae6483 Mon Sep 17 00:00:00 2001
From: Daniel Mokeev <mokeev.gh at gmail.com>
Date: Wed, 9 Oct 2024 18:14:50 +0200
Subject: [PATCH 1/3] first version of orc b opts

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |  72 +++-
 llvm/test/CodeGen/RISCV/orc-b-patterns.ll   | 345 ++++++++++++++++++++
 2 files changed, 414 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/orc-b-patterns.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 01fa418e4dbdf4..1619b684dd5818 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -23,12 +23,14 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -13572,9 +13574,71 @@ static SDValue combineSubOfBoolean(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::ADD, DL, VT, NewLHS, NewRHS);
 }
 
+// Looks for (sub (shl X, 8-N), (shr X, N)) where the N-th bit in each byte is potentially set. Replace with orc.b. 
+static SDValue combineSubShiftToOrcBGeneralized(SDNode *N, SelectionDAG &DAG,
+                                     const RISCVSubtarget &Subtarget) {
+  if (!Subtarget.hasStdExtZbb())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  if (VT != Subtarget.getXLenVT() && VT != MVT::i32 && VT != MVT::i16)
+    return SDValue(); 
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  
+  if (N0->getOpcode() != ISD::SHL){
+    return SDValue();
+  }  
+  
+  auto *ShAmtCLeft = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (!ShAmtCLeft)
+    return SDValue();
+  unsigned ShiftedAmount = 8 - ShAmtCLeft->getZExtValue();
+  SDValue LeftShiftOperand = N0->getOperand(0);
+  SDValue RightShiftOperand;
+
+  if (N1->getOpcode() == ISD::SRL){ // (sub (shl X, 8 - N), (srl X, N)) case
+    auto *ShAmtCRight = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    // Note that the (sub (X, (shr X, 8))) is a degenerate case that should not get optimized,
+    // as we would be replacing a subtraction with an orc.b
+    // (!N0.hasOneUse() && !N1.hasOneUse())
+    if (!ShAmtCRight || ShAmtCRight->getZExtValue() == 8 || ShAmtCRight->getZExtValue() != ShiftedAmount )
+    {
+        return SDValue();
+    }
+    if (!N0.hasOneUse() && !N1.hasOneUse()){
+      dbgs() << "Both operands both have > 1 use\n";
+      return SDValue();
+    }
+    RightShiftOperand = N1.getOperand(0);
+  }
+  else{ // (sub (shl X, 8), X) case
+    if (!N0.hasOneUse()){
+      dbgs() << "N0 has > 1 uses\n";
+      return SDValue();
+    }
+    // llvm::errs() << "N0 has " << N0->get;
+    RightShiftOperand = N1;
+  }
+
+  APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0x1));
+  Mask <<= ShiftedAmount;
+  // Check that X has indeed the right shape (only the N-th bit can be set in every byte)
+  if(!DAG.MaskedValueIsZero(LeftShiftOperand, ~Mask))
+    return SDValue();
+
+  if (LeftShiftOperand != RightShiftOperand)
+    return SDValue();
+  dbgs() << "Optimized for node - " << N->getDebugLoc() << "\n";
+  return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, LeftShiftOperand);
+}
+
+
 // Looks for (sub (shl X, 8), X) where only bits 8, 16, 24, 32, etc. of X are
 // non-zero. Replace with orc.b.
-static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG,
+__attribute__((unused)) static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG,
                                      const RISCVSubtarget &Subtarget) {
   if (!Subtarget.hasStdExtZbb())
     return SDValue();
@@ -13596,7 +13660,7 @@ static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG,
 
   APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0xfe));
   if (!DAG.MaskedValueIsZero(N1, Mask))
-    return SDValue();
+    return SDValue(); 
 
   return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, N1);
 }
@@ -13623,7 +13687,9 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
 
   if (SDValue V = combineBinOpOfZExt(N, DAG))
     return V;
-  if (SDValue V = combineSubShiftToOrcB(N, DAG, Subtarget))
+  // if (SDValue V = combineSubShiftToOrcB(N, DAG, Subtarget))
+  //   return V;
+  if (SDValue V  = combineSubShiftToOrcBGeneralized(N, DAG, Subtarget))
     return V;
 
   // fold (sub x, (select lhs, rhs, cc, 0, y)) ->
diff --git a/llvm/test/CodeGen/RISCV/orc-b-patterns.ll b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll
new file mode 100644
index 00000000000000..b7c67cf8930133
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll
@@ -0,0 +1,345 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV32I
+; RUN: llc -mtriple=riscv32 -mattr=+zbb -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV32ZBB
+
+define i32 @orc_b_i32_mul255(i32 %x) nounwind {
+; RV32I-LABEL: orc_b_i32_mul255:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a1, 4112
+; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: orc_b_i32_mul255:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    lui a1, 4112
+; RV32ZBB-NEXT:    addi a1, a1, 257
+; RV32ZBB-NEXT:    and a0, a0, a1
+; RV32ZBB-NEXT:    orc.b a0, a0
+; RV32ZBB-NEXT:    ret
+entry:
+  %and = and i32 %x, 16843009
+  %mul = mul nuw nsw i32 %and, 255
+  ret i32 %mul
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_lsb(i32  %x)  {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_lsb:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a1, 4112
+; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_lsb:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    lui a1, 4112
+; RV32ZBB-NEXT:    addi a1, a1, 257
+; RV32ZBB-NEXT:    and a0, a0, a1
+; RV32ZBB-NEXT:    orc.b a0, a0
+; RV32ZBB-NEXT:    ret
+entry:
+  %and = and i32 %x, 16843009
+  %sub = mul nuw i32 %and, 255
+  ret i32 %sub
+}
+
+define  i32 @orc_b_i32_sub_shl8x_x_b1(i32  %x)  {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a1, 8224
+; RV32I-NEXT:    addi a1, a1, 514
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 7
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    lui a1, 8224
+; RV32ZBB-NEXT:    addi a1, a1, 514
+; RV32ZBB-NEXT:    and a0, a0, a1
+; RV32ZBB-NEXT:    orc.b a0, a0
+; RV32ZBB-NEXT:    ret
+entry:
+  %and = and i32 %x, 33686018
+  %shl = shl i32 %and, 7
+  %shr = lshr exact i32 %and, 1
+  %sub = sub nsw i32 %shl, %shr
+  ret i32 %sub
+}
+
+
+define  i32 @orc_b_i32_sub_shl8x_x_b2(i32  %x)  {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a1, 16448
+; RV32I-NEXT:    addi a1, a1, 1028
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 6
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b2:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    lui a1, 16448
+; RV32ZBB-NEXT:    addi a1, a1, 1028
+; RV32ZBB-NEXT:    and a0, a0, a1
+; RV32ZBB-NEXT:    orc.b a0, a0
+; RV32ZBB-NEXT:    ret
+entry:
+  %and = and i32 %x, 67372036
+  %shl = shl i32 %and, 6
+  %shr = lshr exact i32 %and, 2
+  %sub = sub nsw i32 %shl, %shr
+  ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b3(i32  %x)  {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a1, 24672
+; CHECK-NEXT:    addi a1, a1, 1542
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    slli a1, a0, 5
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %x, 101058054
+  %shl = shl nuw i32 %and, 5
+  %shr = lshr i32 %and, 3
+  %sub = sub nsw i32 %shl, %shr
+  ret i32 %sub
+}
+
+
+define  i32 @orc_b_i32_sub_shl8x_x_b4(i32  %x)  {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a1, 32897
+; CHECK-NEXT:    addi a1, a1, -2040
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    srli a0, a0, 4
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %x, 134744072
+  %shl = shl nuw i32 %and, 4
+  %shr = lshr i32 %and, 4
+  %sub = sub nsw i32 %shl, %shr
+  ret i32 %sub
+}
+
+
+define  i32 @orc_b_i32_sub_shl8x_x_b5(i32  %x)  {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a1, 65793
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 5
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %x, 269488144
+  %shl = shl nuw i32 %and, 3
+  %shr = lshr i32 %and, 5
+  %sub = sub nsw i32 %shl, %shr
+  ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b6(i32 %x)  {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a1, 131586
+; CHECK-NEXT:    addi a1, a1, 32
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    slli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 6
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %x, 538976288
+  %shl = shl nuw i32 %and, 2
+  %shr = lshr i32 %and, 6
+  %sub = sub nsw i32 %shl, %shr
+  ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b7(i32 %x)  {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b7:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a1, 263172
+; CHECK-NEXT:    addi a1, a1, 64
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    srli a0, a0, 7
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %x, 1077952576
+  %shl = shl nuw i32 %and, 1
+  %shr = lshr i32 %and, 7
+  %sub = sub nsw i32 %shl, %shr
+  ret i32 %sub
+}
+
+define i32 @orc_b_i32_sub_shl8x_x_b1_shl_used(i32 %x, ptr %arr) {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a2, 8224
+; RV32I-NEXT:    addi a2, a2, 514
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 7
+; RV32I-NEXT:    srli a3, a0, 1
+; RV32I-NEXT:    sub a0, a2, a3
+; RV32I-NEXT:    sw a3, 0(a1)
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    lui a2, 8224
+; RV32ZBB-NEXT:    addi a2, a2, 514
+; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    srli a2, a0, 1
+; RV32ZBB-NEXT:    orc.b a0, a0
+; RV32ZBB-NEXT:    sw a2, 0(a1)
+; RV32ZBB-NEXT:    ret
+entry:
+  %and = and i32 %x, 33686018
+  %shl = shl i32 %and, 7
+  %shr = lshr exact i32 %and, 1
+  store i32 %shr, ptr %arr, align 4
+  %sub = sub nsw i32 %shl, %shr
+  ret i32 %sub
+}
+
+define i32 @orc_b_i32_sub_shl8x_x_b1_srl_used(i32  %x, ptr %arr) {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a2, 8224
+; RV32I-NEXT:    addi a2, a2, 514
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 7
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    sub a0, a2, a0
+; RV32I-NEXT:    sw a2, 0(a1)
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    lui a2, 8224
+; RV32ZBB-NEXT:    addi a2, a2, 514
+; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    slli a2, a0, 7
+; RV32ZBB-NEXT:    orc.b a0, a0
+; RV32ZBB-NEXT:    sw a2, 0(a1)
+; RV32ZBB-NEXT:    ret
+entry:
+  %and = and i32 %x, 33686018
+  %shl = shl i32 %and, 7
+  %shr = lshr exact i32 %and, 1
+  store i32 %shl, ptr %arr, align 4
+  %sub = sub nsw i32 %shl, %shr
+  ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b1_not_used(i32  %x, ptr %arr) {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_not_used:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a1, 8224
+; RV32I-NEXT:    addi a1, a1, 514
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 7
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_not_used:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    lui a1, 8224
+; RV32ZBB-NEXT:    addi a1, a1, 514
+; RV32ZBB-NEXT:    and a0, a0, a1
+; RV32ZBB-NEXT:    orc.b a0, a0
+; RV32ZBB-NEXT:    ret
+entry:
+  %and = and i32 %x, 33686018
+  %shl = shl i32 %and, 7
+  %shr = lshr exact i32 %and, 1
+  %sub = sub nsw i32 %shl, %shr
+  ret i32 %sub
+}
+
+define i32 @orc_b_i32_sub_shl8x_x_shl_used(i32  %x, ptr %arr){
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_shl_used:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a2, 4112
+; CHECK-NEXT:    addi a2, a2, 257
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    slli a2, a0, 8
+; CHECK-NEXT:    sub a0, a2, a0
+; CHECK-NEXT:    sw a2, 0(a1)
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %x, 16843009
+  %shl = shl i32 %and, 8
+  store i32 %shl, ptr %arr, align 4
+  %sub = mul nuw i32 %and, 255
+  ret i32 %sub
+}
+
+define i32 @orc_b_i32_sub_shl8x_x_b1_both_used(i32  %x, ptr %arr) {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b1_both_used:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a2, 8224
+; CHECK-NEXT:    addi a2, a2, 514
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    slli a2, a0, 7
+; CHECK-NEXT:    srli a3, a0, 1
+; CHECK-NEXT:    sw a2, 0(a1)
+; CHECK-NEXT:    sub a0, a2, a3
+; CHECK-NEXT:    sw a3, 4(a1)
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %x, 33686018
+  %shl = shl i32 %and, 7
+  %shr = lshr exact i32 %and, 1
+  store i32 %shl, ptr %arr, align 4
+  %arrayidx1 = getelementptr inbounds i8, ptr %arr, i32 4
+  store i32 %shr, ptr %arrayidx1, align 4
+  %sub = sub nsw i32 %shl, %shr
+  ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_x_shr8x(i32 %x)  {
+; CHECK-LABEL: orc_b_i32_sub_x_shr8x:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a1, 4112
+; CHECK-NEXT:    addi a1, a1, 257
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    srli a1, a0, 8
+; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %x, 16843009
+  %shr = lshr i32 %and, 8
+  %sub = sub nsw i32 %and, %shr
+  ret i32 %sub
+}
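
For context on why the (sub (shl X, 8-N), (srl X, N)) to orc.b rewrite in PATCH 1 is sound: when only bit N of each byte of X can be set, every such byte contributes 2^(8i+8) - 2^(8i) = 0xFF << 8i to the subtraction, which is exactly what orc.b produces (every non-zero byte becomes 0xFF, every zero byte stays 0x00). Below is a minimal standalone C++ sketch that brute-forces the identity for i32; orcb32 is my own reference model of the instruction, not code from the patch:

  #include <cassert>
  #include <cstdint>

  // Reference model of orc.b on a 32-bit value: every non-zero byte becomes
  // 0xFF, every zero byte stays 0x00.
  static uint32_t orcb32(uint32_t X) {
    uint32_t R = 0;
    for (int I = 0; I < 4; ++I)
      if ((X >> (8 * I)) & 0xFF)
        R |= 0xFFu << (8 * I);
    return R;
  }

  int main() {
    // For every bit position N and every combination of bytes that have bit N
    // set, (X << (8 - N)) - (X >> N) equals orc.b(X). N == 0 degenerates to
    // (X << 8) - X, i.e. X * 255, which the mul255 test above exercises.
    for (unsigned N = 0; N < 8; ++N) {
      for (uint32_t Bytes = 0; Bytes < 16; ++Bytes) {
        uint32_t X = 0;
        for (int I = 0; I < 4; ++I)
          if (Bytes & (1u << I))
            X |= (1u << N) << (8 * I);
        assert((X << (8 - N)) - (X >> N) == orcb32(X));
      }
    }
    return 0;
  }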

>From 3141c5dee88dc74b171940a713b2528220fcca6b Mon Sep 17 00:00:00 2001
From: Daniel Mokeev <mokeev.gh at gmail.com>
Date: Thu, 10 Oct 2024 13:43:53 +0200
Subject: [PATCH 2/3] [RISCV] Add DAG combine to turn (sub (shl X, 8-Y), (shr
 X, Y)) into orc.b

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 85 ++++++---------------
 llvm/test/CodeGen/RISCV/orc-b-patterns.ll   | 27 +++++++
 2 files changed, 50 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1619b684dd5818..ee948ca71a31c9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -13574,8 +13574,11 @@ static SDValue combineSubOfBoolean(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::ADD, DL, VT, NewLHS, NewRHS);
 }
 
-// Looks for (sub (shl X, 8-N), (shr X, N)) where the N-th bit in each byte is potentially set. Replace with orc.b. 
-static SDValue combineSubShiftToOrcBGeneralized(SDNode *N, SelectionDAG &DAG,
+// Looks for (sub (shl X, 8-Y), (shr X, Y)) where the Y-th bit in each byte is
+// potentially set. It is fine for Y to be 0, meaning that (sub (shl X, 8), X)
+// is also valid. Replace with (orc.b X). For example, 0b0000_1000_0000_1000 is
+// valid with Y=3, while 0b0000_1000_0000_0100 is not.
+static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG,
                                      const RISCVSubtarget &Subtarget) {
   if (!Subtarget.hasStdExtZbb())
     return SDValue();
@@ -13583,86 +13586,46 @@ static SDValue combineSubShiftToOrcBGeneralized(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
 
   if (VT != Subtarget.getXLenVT() && VT != MVT::i32 && VT != MVT::i16)
-    return SDValue(); 
+    return SDValue();
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  
-  if (N0->getOpcode() != ISD::SHL){
+
+  if (N0->getOpcode() != ISD::SHL)
     return SDValue();
-  }  
   
   auto *ShAmtCLeft = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   if (!ShAmtCLeft)
     return SDValue();
   unsigned ShiftedAmount = 8 - ShAmtCLeft->getZExtValue();
   SDValue LeftShiftOperand = N0->getOperand(0);
-  SDValue RightShiftOperand;
+  SDValue RightShiftOperand = N1;
+
+  if (ShiftedAmount != 0 && N1->getOpcode() != ISD::SRL)
+    return SDValue();
 
-  if (N1->getOpcode() == ISD::SRL){ // (sub (shl X, 8 - N), (srl X, N)) case
+  if (ShiftedAmount != 0) { // Right operand must be a right shift.
     auto *ShAmtCRight = dyn_cast<ConstantSDNode>(N1.getOperand(1));
-    // Note that the (sub (X, (shr X, 8))) is a degenerate case that should not get optimized,
-    // as we would be replacing a subtraction with an orc.b
-    // (!N0.hasOneUse() && !N1.hasOneUse())
-    if (!ShAmtCRight || ShAmtCRight->getZExtValue() == 8 || ShAmtCRight->getZExtValue() != ShiftedAmount )
-    {
-        return SDValue();
-    }
-    if (!N0.hasOneUse() && !N1.hasOneUse()){
-      dbgs() << "Both operands both have > 1 use\n";
+    if (!ShAmtCRight || ShAmtCRight->getZExtValue() != ShiftedAmount)
       return SDValue();
-    }
     RightShiftOperand = N1.getOperand(0);
   }
-  else{ // (sub (shl X, 8), X) case
-    if (!N0.hasOneUse()){
-      dbgs() << "N0 has > 1 uses\n";
-      return SDValue();
-    }
-    // llvm::errs() << "N0 has " << N0->get;
-    RightShiftOperand = N1;
-  }
 
-  APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0x1));
-  Mask <<= ShiftedAmount;
-  // Check that X has indeed the right shape (only the N-th bit can be set in every byte)
-  if(!DAG.MaskedValueIsZero(LeftShiftOperand, ~Mask))
+  // At least one shift should have a single use.
+  if (!N0.hasOneUse() && (ShiftedAmount == 0 || !N1.hasOneUse()))
     return SDValue();
 
   if (LeftShiftOperand != RightShiftOperand)
     return SDValue();
-  dbgs() << "Optimized for node - " << N->getDebugLoc() << "\n";
-  return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, LeftShiftOperand);
-}
-
-
-// Looks for (sub (shl X, 8), X) where only bits 8, 16, 24, 32, etc. of X are
-// non-zero. Replace with orc.b.
-__attribute__((unused)) static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG,
-                                     const RISCVSubtarget &Subtarget) {
-  if (!Subtarget.hasStdExtZbb())
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
 
-  if (VT != Subtarget.getXLenVT() && VT != MVT::i32 && VT != MVT::i16)
-    return SDValue();
-
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
-  if (N0.getOpcode() != ISD::SHL || N0.getOperand(0) != N1 || !N0.hasOneUse())
-    return SDValue();
-
-  auto *ShAmtC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
-  if (!ShAmtC || ShAmtC->getZExtValue() != 8)
+  APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0x1));
+  Mask <<= ShiftedAmount;
+  // Check that X has indeed the right shape (only the Y-th bit can be set in
+  // every byte).
+  if (!DAG.MaskedValueIsZero(LeftShiftOperand, ~Mask))
     return SDValue();
 
-  APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0xfe));
-  if (!DAG.MaskedValueIsZero(N1, Mask))
-    return SDValue(); 
-
-  return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, N1);
+  return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, LeftShiftOperand);
 }
 
 static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
@@ -13687,9 +13650,7 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
 
   if (SDValue V = combineBinOpOfZExt(N, DAG))
     return V;
-  // if (SDValue V = combineSubShiftToOrcB(N, DAG, Subtarget))
-  //   return V;
-  if (SDValue V  = combineSubShiftToOrcBGeneralized(N, DAG, Subtarget))
+  if (SDValue V = combineSubShiftToOrcB(N, DAG, Subtarget))
     return V;
 
   // fold (sub x, (select lhs, rhs, cc, 0, y)) ->
diff --git a/llvm/test/CodeGen/RISCV/orc-b-patterns.ll b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll
index b7c67cf8930133..184e66c14b33fc 100644
--- a/llvm/test/CodeGen/RISCV/orc-b-patterns.ll
+++ b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll
@@ -51,6 +51,33 @@ entry:
   ret i32 %sub
 }
 
+define i32 @orc_b_i32_sub_shl8x_x_lsb_preshifted(i32 %x){
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_lsb_preshifted:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    srli a0, a0, 11
+; RV32I-NEXT:    lui a1, 16
+; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_lsb_preshifted:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    srli a0, a0, 11
+; RV32ZBB-NEXT:    lui a1, 16
+; RV32ZBB-NEXT:    addi a1, a1, 257
+; RV32ZBB-NEXT:    and a0, a0, a1
+; RV32ZBB-NEXT:    orc.b a0, a0
+; RV32ZBB-NEXT:    ret
+entry:
+  %shr = lshr i32 %x, 11
+  %and = and i32 %shr, 16843009
+  %sub = mul nuw i32 %and, 255
+  ret i32 %sub
+}
+
+
 define  i32 @orc_b_i32_sub_shl8x_x_b1(i32  %x)  {
 ; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1:
 ; RV32I:       # %bb.0: # %entry
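
For reference, the guard in the cleaned-up combine, DAG.MaskedValueIsZero(LeftShiftOperand, ~Mask), requires that only bit Y of each byte can be non-zero, and the `and` constants in the test file are exactly those byte-splat masks: 16843009 is 0x01010101, 33686018 is 0x02020202, and so on. The b3 through b7 tests deliberately use constants that do not line up with their shift amounts, so the combine must not fire and both RUN lines keep the shift/sub sequence, while the *_used tests cover the one-use heuristic (at least one of the two shifts must have a single use). A small scalar model of the mask check, using my own helper name rather than anything from the patch:

  #include <cassert>
  #include <cstdint>

  // Scalar stand-in for the MaskedValueIsZero guard: bit Y must be the only
  // bit that can be set in each byte of X.
  static bool onlyBitYPerByte(uint32_t X, unsigned Y) {
    uint32_t Mask = 0x01010101u << Y;  // APInt::getSplat(32, APInt(8, 1)) << Y
    return (X & ~Mask) == 0;
  }

  int main() {
    // Positive tests: the `and` constants line up with the shift amounts.
    assert(onlyBitYPerByte(16843009u, 0));   // mul255 / lsb: 0x01010101
    assert(onlyBitYPerByte(33686018u, 1));   // b1: shl 7 / lshr 1, 0x02020202
    assert(onlyBitYPerByte(67372036u, 2));   // b2: shl 6 / lshr 2, 0x04040404
    // Negative tests: other bits may be set, so the combine has to bail out.
    assert(!onlyBitYPerByte(101058054u, 3)); // b3: 0x06060606, bits 1 and 2
    assert(!onlyBitYPerByte(134744072u, 4)); // b4: 0x08080808, bit 3 not bit 4
    assert(!onlyBitYPerByte(269488144u, 5)); // b5: 0x10101010, bit 4 not bit 5
    return 0;
  }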

>From ef833f4e348177dca998fe2b4697e6595736b348 Mon Sep 17 00:00:00 2001
From: Daniel Mokeev <mokeev.gh at gmail.com>
Date: Thu, 10 Oct 2024 18:11:59 +0200
Subject: [PATCH 3/3] Fix typo, remove unused includes and add early return
 for correctness

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ee948ca71a31c9..57010ab1adb3dd 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -23,14 +23,12 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/VectorUtils.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
-#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -13598,6 +13596,10 @@ static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG,
   if (!ShAmtCLeft)
     return SDValue();
   unsigned ShiftedAmount = 8 - ShAmtCLeft->getZExtValue();
+
+  if (ShiftedAmount >= 8)
+    return SDValue();
+
   SDValue LeftShiftOperand = N0->getOperand(0);
   SDValue RightShiftOperand = N1;
 



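PATCH 3/3 adds the ShiftedAmount >= 8 guard because the value is computed as 8 - ShAmtCLeft->getZExtValue() in unsigned arithmetic: a left-shift amount of 0 yields 8 (the degenerate (sub X, (srl X, 8)) form), and an amount greater than 8 wraps around instead of going negative, so both must make the combine bail out. A tiny sketch of that wraparound, plain C++ and not taken from the patch:

  #include <cstdio>

  int main() {
    // ShiftedAmount = 8 - ShAmt in unsigned arithmetic. Anything >= 8 must be
    // rejected: a shift amount of 0 maps to the degenerate value 8, and
    // amounts above 8 wrap around instead of going negative.
    const unsigned Amts[] = {0, 7, 8, 9, 12};
    for (unsigned ShAmt : Amts) {
      unsigned ShiftedAmount = 8 - ShAmt;
      std::printf("shl amount %u -> ShiftedAmount %u (%s)\n", ShAmt,
                  ShiftedAmount, ShiftedAmount >= 8 ? "rejected" : "kept");
    }
    return 0;
  }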