[llvm] [RISCV] Add DAG combine to turn (sub (shl X, 8-Y), (shr X, Y)) into orc.b (PR #111828)
Daniel Mokeev via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 11 05:33:19 PDT 2024
https://github.com/damokeev updated https://github.com/llvm/llvm-project/pull/111828
From 320d370e5120c44a299ec6d9ec97ca16a462f71d Mon Sep 17 00:00:00 2001
From: Daniel Mokeev <mokeev.gh at gmail.com>
Date: Wed, 9 Oct 2024 18:14:50 +0200
Subject: [PATCH] [RISCV] Add DAG combine to turn (sub (shl X, 8-Y), (shr X,
Y)) into orc.b
This patch generalizes the existing (sub (shl X, 8), X) => (orc.b X) DAG combine
to the form (sub (shl X, 8 - Y), (srl X, Y)) => (orc.b X).
Alive2 generalized proof: https://alive2.llvm.org/ce/z/dFcf_n
Related issue: https://github.com/llvm/llvm-project/issues/96595
Related PR: https://github.com/llvm/llvm-project/pull/96680
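For illustration, the Y=1 case from the new tests (orc_b_i32_sub_shl8x_x_b1):
once X is masked so that only bit 1 of each byte can be set, the shl/srl pair
and the subtract collapse to a single orc.b under Zbb:

  %and = and i32 %x, 33686018      ; 0x02020202, only bit 1 of each byte
  %shl = shl i32 %and, 7           ; X << (8 - Y) with Y = 1
  %shr = lshr exact i32 %and, 1    ; X >> Y
  %sub = sub nsw i32 %shl, %shr    ; folds to (orc.b %and) with this patch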
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 44 ++-
llvm/test/CodeGen/RISCV/orc-b-patterns.ll | 372 ++++++++++++++++++++
2 files changed, 408 insertions(+), 8 deletions(-)
create mode 100644 llvm/test/CodeGen/RISCV/orc-b-patterns.ll
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 01fa418e4dbdf4..bfabf87811c8f9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -13572,8 +13572,10 @@ static SDValue combineSubOfBoolean(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::ADD, DL, VT, NewLHS, NewRHS);
}
-// Looks for (sub (shl X, 8), X) where only bits 8, 16, 24, 32, etc. of X are
-// non-zero. Replace with orc.b.
+// Looks for (sub (shl X, 8-Y), (shr X, Y)) where only the Y-th bit in each
+// byte of X can be set. Y may be 0, so (sub (shl X, 8), X) is also matched.
+// Replace with (orc.b X). For example, 0b0000_1000_0000_1000 is valid with
+// Y=3, while 0b0000_1000_0000_0100 is not.
static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
if (!Subtarget.hasStdExtZbb())
@@ -13587,18 +13589,44 @@ static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (N0.getOpcode() != ISD::SHL || N0.getOperand(0) != N1 || !N0.hasOneUse())
+ if (N0->getOpcode() != ISD::SHL)
return SDValue();
- auto *ShAmtC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
- if (!ShAmtC || ShAmtC->getZExtValue() != 8)
+ auto *ShAmtCLeft = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!ShAmtCLeft)
return SDValue();
+ unsigned ShiftedAmount = 8 - ShAmtCLeft->getZExtValue();
- APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0xfe));
- if (!DAG.MaskedValueIsZero(N1, Mask))
+ if (ShiftedAmount >= 8)
return SDValue();
- return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, N1);
+ SDValue LeftShiftOperand = N0->getOperand(0);
+ SDValue RightShiftOperand = N1;
+
+ if (ShiftedAmount != 0) { // Right operand must be a right shift.
+ if (N1->getOpcode() != ISD::SRL)
+ return SDValue();
+ auto *ShAmtCRight = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+ if (!ShAmtCRight || ShAmtCRight->getZExtValue() != ShiftedAmount)
+ return SDValue();
+ RightShiftOperand = N1.getOperand(0);
+ }
+
+ // At least one shift should have a single use.
+ if (!N0.hasOneUse() && (ShiftedAmount == 0 || !N1.hasOneUse()))
+ return SDValue();
+
+ if (LeftShiftOperand != RightShiftOperand)
+ return SDValue();
+
+ APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0x1));
+ Mask <<= ShiftedAmount;
+ // Check that X has indeed the right shape (only the Y-th bit can be set in
+ // every byte).
+ if (!DAG.MaskedValueIsZero(LeftShiftOperand, ~Mask))
+ return SDValue();
+
+ return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, LeftShiftOperand);
}
static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/RISCV/orc-b-patterns.ll b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll
new file mode 100644
index 00000000000000..184e66c14b33fc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll
@@ -0,0 +1,372 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefixes=CHECK,RV32I
+; RUN: llc -mtriple=riscv32 -mattr=+zbb -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefixes=CHECK,RV32ZBB
+
+define i32 @orc_b_i32_mul255(i32 %x) nounwind {
+; RV32I-LABEL: orc_b_i32_mul255:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a1, 4112
+; RV32I-NEXT: addi a1, a1, 257
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 8
+; RV32I-NEXT: sub a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: orc_b_i32_mul255:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: lui a1, 4112
+; RV32ZBB-NEXT: addi a1, a1, 257
+; RV32ZBB-NEXT: and a0, a0, a1
+; RV32ZBB-NEXT: orc.b a0, a0
+; RV32ZBB-NEXT: ret
+entry:
+ %and = and i32 %x, 16843009
+ %mul = mul nuw nsw i32 %and, 255
+ ret i32 %mul
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_lsb(i32 %x) {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_lsb:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a1, 4112
+; RV32I-NEXT: addi a1, a1, 257
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 8
+; RV32I-NEXT: sub a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_lsb:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: lui a1, 4112
+; RV32ZBB-NEXT: addi a1, a1, 257
+; RV32ZBB-NEXT: and a0, a0, a1
+; RV32ZBB-NEXT: orc.b a0, a0
+; RV32ZBB-NEXT: ret
+entry:
+ %and = and i32 %x, 16843009
+ %sub = mul nuw i32 %and, 255
+ ret i32 %sub
+}
+
+define i32 @orc_b_i32_sub_shl8x_x_lsb_preshifted(i32 %x){
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_lsb_preshifted:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: srli a0, a0, 11
+; RV32I-NEXT: lui a1, 16
+; RV32I-NEXT: addi a1, a1, 257
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 8
+; RV32I-NEXT: sub a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_lsb_preshifted:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: srli a0, a0, 11
+; RV32ZBB-NEXT: lui a1, 16
+; RV32ZBB-NEXT: addi a1, a1, 257
+; RV32ZBB-NEXT: and a0, a0, a1
+; RV32ZBB-NEXT: orc.b a0, a0
+; RV32ZBB-NEXT: ret
+entry:
+ %shr = lshr i32 %x, 11
+ %and = and i32 %shr, 16843009
+ %sub = mul nuw i32 %and, 255
+ ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b1(i32 %x) {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a1, 8224
+; RV32I-NEXT: addi a1, a1, 514
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 7
+; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: sub a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: lui a1, 8224
+; RV32ZBB-NEXT: addi a1, a1, 514
+; RV32ZBB-NEXT: and a0, a0, a1
+; RV32ZBB-NEXT: orc.b a0, a0
+; RV32ZBB-NEXT: ret
+entry:
+ %and = and i32 %x, 33686018
+ %shl = shl i32 %and, 7
+ %shr = lshr exact i32 %and, 1
+ %sub = sub nsw i32 %shl, %shr
+ ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b2(i32 %x) {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b2:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a1, 16448
+; RV32I-NEXT: addi a1, a1, 1028
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 6
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: sub a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b2:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: lui a1, 16448
+; RV32ZBB-NEXT: addi a1, a1, 1028
+; RV32ZBB-NEXT: and a0, a0, a1
+; RV32ZBB-NEXT: orc.b a0, a0
+; RV32ZBB-NEXT: ret
+entry:
+ %and = and i32 %x, 67372036
+ %shl = shl i32 %and, 6
+ %shr = lshr exact i32 %and, 2
+ %sub = sub nsw i32 %shl, %shr
+ ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b3(i32 %x) {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a1, 24672
+; CHECK-NEXT: addi a1, a1, 1542
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: slli a1, a0, 5
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %and = and i32 %x, 101058054
+ %shl = shl nuw i32 %and, 5
+ %shr = lshr i32 %and, 3
+ %sub = sub nsw i32 %shl, %shr
+ ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b4(i32 %x) {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a1, 32897
+; CHECK-NEXT: addi a1, a1, -2040
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: slli a1, a0, 4
+; CHECK-NEXT: srli a0, a0, 4
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %and = and i32 %x, 134744072
+ %shl = shl nuw i32 %and, 4
+ %shr = lshr i32 %and, 4
+ %sub = sub nsw i32 %shl, %shr
+ ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b5(i32 %x) {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a1, 65793
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: slli a1, a0, 3
+; CHECK-NEXT: srli a0, a0, 5
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %and = and i32 %x, 269488144
+ %shl = shl nuw i32 %and, 3
+ %shr = lshr i32 %and, 5
+ %sub = sub nsw i32 %shl, %shr
+ ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b6(i32 %x) {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a1, 131586
+; CHECK-NEXT: addi a1, a1, 32
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: slli a1, a0, 2
+; CHECK-NEXT: srli a0, a0, 6
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %and = and i32 %x, 538976288
+ %shl = shl nuw i32 %and, 2
+ %shr = lshr i32 %and, 6
+ %sub = sub nsw i32 %shl, %shr
+ ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b7(i32 %x) {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a1, 263172
+; CHECK-NEXT: addi a1, a1, 64
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: srli a0, a0, 7
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %and = and i32 %x, 1077952576
+ %shl = shl nuw i32 %and, 1
+ %shr = lshr i32 %and, 7
+ %sub = sub nsw i32 %shl, %shr
+ ret i32 %sub
+}
+
+define i32 @orc_b_i32_sub_shl8x_x_b1_shl_used(i32 %x, ptr %arr) {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a2, 8224
+; RV32I-NEXT: addi a2, a2, 514
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 7
+; RV32I-NEXT: srli a3, a0, 1
+; RV32I-NEXT: sub a0, a2, a3
+; RV32I-NEXT: sw a3, 0(a1)
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: lui a2, 8224
+; RV32ZBB-NEXT: addi a2, a2, 514
+; RV32ZBB-NEXT: and a0, a0, a2
+; RV32ZBB-NEXT: srli a2, a0, 1
+; RV32ZBB-NEXT: orc.b a0, a0
+; RV32ZBB-NEXT: sw a2, 0(a1)
+; RV32ZBB-NEXT: ret
+entry:
+ %and = and i32 %x, 33686018
+ %shl = shl i32 %and, 7
+ %shr = lshr exact i32 %and, 1
+ store i32 %shr, ptr %arr, align 4
+ %sub = sub nsw i32 %shl, %shr
+ ret i32 %sub
+}
+
+define i32 @orc_b_i32_sub_shl8x_x_b1_srl_used(i32 %x, ptr %arr) {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a2, 8224
+; RV32I-NEXT: addi a2, a2, 514
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 7
+; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: sub a0, a2, a0
+; RV32I-NEXT: sw a2, 0(a1)
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: lui a2, 8224
+; RV32ZBB-NEXT: addi a2, a2, 514
+; RV32ZBB-NEXT: and a0, a0, a2
+; RV32ZBB-NEXT: slli a2, a0, 7
+; RV32ZBB-NEXT: orc.b a0, a0
+; RV32ZBB-NEXT: sw a2, 0(a1)
+; RV32ZBB-NEXT: ret
+entry:
+ %and = and i32 %x, 33686018
+ %shl = shl i32 %and, 7
+ %shr = lshr exact i32 %and, 1
+ store i32 %shl, ptr %arr, align 4
+ %sub = sub nsw i32 %shl, %shr
+ ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_shl8x_x_b1_not_used(i32 %x, ptr %arr) {
+; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_not_used:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a1, 8224
+; RV32I-NEXT: addi a1, a1, 514
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 7
+; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: sub a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_not_used:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: lui a1, 8224
+; RV32ZBB-NEXT: addi a1, a1, 514
+; RV32ZBB-NEXT: and a0, a0, a1
+; RV32ZBB-NEXT: orc.b a0, a0
+; RV32ZBB-NEXT: ret
+entry:
+ %and = and i32 %x, 33686018
+ %shl = shl i32 %and, 7
+ %shr = lshr exact i32 %and, 1
+ %sub = sub nsw i32 %shl, %shr
+ ret i32 %sub
+}
+
+define i32 @orc_b_i32_sub_shl8x_x_shl_used(i32 %x, ptr %arr){
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_shl_used:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a2, 4112
+; CHECK-NEXT: addi a2, a2, 257
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: slli a2, a0, 8
+; CHECK-NEXT: sub a0, a2, a0
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: ret
+entry:
+ %and = and i32 %x, 16843009
+ %shl = shl i32 %and, 8
+ store i32 %shl, ptr %arr, align 4
+ %sub = mul nuw i32 %and, 255
+ ret i32 %sub
+}
+
+define i32 @orc_b_i32_sub_shl8x_x_b1_both_used(i32 %x, ptr %arr) {
+; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b1_both_used:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a2, 8224
+; CHECK-NEXT: addi a2, a2, 514
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: slli a2, a0, 7
+; CHECK-NEXT: srli a3, a0, 1
+; CHECK-NEXT: sw a2, 0(a1)
+; CHECK-NEXT: sub a0, a2, a3
+; CHECK-NEXT: sw a3, 4(a1)
+; CHECK-NEXT: ret
+entry:
+ %and = and i32 %x, 33686018
+ %shl = shl i32 %and, 7
+ %shr = lshr exact i32 %and, 1
+ store i32 %shl, ptr %arr, align 4
+ %arrayidx1 = getelementptr inbounds i8, ptr %arr, i32 4
+ store i32 %shr, ptr %arrayidx1, align 4
+ %sub = sub nsw i32 %shl, %shr
+ ret i32 %sub
+}
+
+
+define i32 @orc_b_i32_sub_x_shr8x(i32 %x) {
+; CHECK-LABEL: orc_b_i32_sub_x_shr8x:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a1, 4112
+; CHECK-NEXT: addi a1, a1, 257
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: srli a1, a0, 8
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: ret
+entry:
+ %and = and i32 %x, 16843009
+ %shr = lshr i32 %and, 8
+ %sub = sub nsw i32 %and, %shr
+ ret i32 %sub
+}