[llvm] [DAG] Matched Fixedwidth Pattern for ISD::AVGCEILU (PR #85031)

Shourya Goel via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 13 00:15:14 PDT 2024


https://github.com/Sh0g0-1758 created https://github.com/llvm/llvm-project/pull/85031

Fixes: #84753

From 69521a3ba63d0d45db081a8ea4e2e9cbacd9505c Mon Sep 17 00:00:00 2001
From: Sh0g0-1758 <shouryagoel10000 at gmail.com>
Date: Wed, 13 Mar 2024 11:25:00 +0530
Subject: [PATCH] Matched Fixed width Pattern for ISD::AVGCEILU

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 37 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/sub_combine.ll      | 34 +++++++++++++++++
 2 files changed, 71 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sub_combine.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 735cec8ecc0627..413112ef896328 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2529,6 +2529,39 @@ static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
 }
 
+// Attempt to form avgceilu(A, B) from sub(or(A, B), lshr(xor(A, B), 1))
+static SDValue combineFixedwidthToAVGCEILU(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::SUB && "SUB node is required here");
+  SDValue Or = N->getOperand(0);
+  SDValue Lshr = N->getOperand(1);
+  if (Or.getOpcode() != ISD::OR || Lshr.getOpcode() != ISD::SRL)
+    return SDValue();
+  SDValue Xor = Lshr.getOperand(0);
+  if (Xor.getOpcode() != ISD::XOR)
+    return SDValue();
+  SDValue Or1 = Or.getOperand(0);
+  SDValue Or2 = Or.getOperand(1);
+  SDValue Xor1 = Xor.getOperand(0);
+  SDValue Xor2 = Xor.getOperand(1);
+  if (Xor1 != Or1 || Xor2 != Or2)
+    return SDValue();
+  // Is the right shift using an immediate value of 1?
+  ConstantSDNode *N1C = isConstOrConstSplat(Lshr.getOperand(1));
+  if (!N1C || N1C->getAPIntValue() != 1)
+    return SDValue();
+  EVT VT = Or1.getValueType();
+  EVT NVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits());
+  if (VT.isVector())
+    VT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
+  else
+    VT = NVT;
+  SDLoc DL(N);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrCustom(ISD::AVGCEILU, VT))
+    return SDValue();
+  return DAG.getNode(ISD::AVGCEILU, DL, VT, Or1, Or2);
+}
+
 /// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
 /// a shift and add with a different constant.
 static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
@@ -3859,6 +3892,10 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
+  // Try to match the AVGCEILU fixed-width pattern
+  if (SDValue V = combineFixedwidthToAVGCEILU(N, DAG))
+    return V;
+
   if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
     return V;
 
diff --git a/llvm/test/CodeGen/AArch64/sub_combine.ll b/llvm/test/CodeGen/AArch64/sub_combine.ll
new file mode 100644
index 00000000000000..f9df436a3db2a2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sub_combine.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -debugify-and-strip-all-safe -enable-machine-outliner=never -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+
+define i4 @sub_fixedwidth_i4(i4 %a0, i4 %a1)  {
+; CHECK-LABEL: sub_fixedwidth_i4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor w8, w0, w1
+; CHECK-NEXT:    orr w9, w0, w1
+; CHECK-NEXT:    and w8, w8, #0xe
+; CHECK-NEXT:    sub w0, w9, w8, lsr #1
+; CHECK-NEXT:    ret
+  %or = or i4 %a0, %a1
+  %xor = xor i4 %a0, %a1
+  %srl = lshr i4 %xor, 1
+  %res = sub i4 %or, %srl
+  ret i4 %res
+}
+
+define <4 x i32> @sub_fixedwidth_v4i32(<4 x i32> %a0, <4 x i32> %a1)  {
+; CHECK-LABEL: sub_fixedwidth_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ushr v1.4s, v2.4s, #1
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %or = or <4 x i32> %a0, %a1
+  %xor = xor <4 x i32> %a0, %a1
+  %srl = lshr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
+  %res = sub <4 x i32> %or, %srl
+  ret <4 x i32> %res
+}
+
+

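For reference (not part of the patch): a minimal standalone C++ sketch, written for this note only and using no LLVM APIs, that exhaustively checks the arithmetic identity the combine relies on, sub(or(A, B), lshr(xor(A, B), 1)) == avgceilu(A, B), over every pair of 8-bit unsigned inputs:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A) {
    for (unsigned B = 0; B < 256; ++B) {
      // The pattern the combiner matches: (A | B) - ((A ^ B) >> 1),
      // computed in plain unsigned arithmetic (all values fit in 8 bits).
      uint8_t Folded = static_cast<uint8_t>((A | B) - ((A ^ B) >> 1));
      // Reference rounding-up unsigned average, ceil((A + B) / 2),
      // computed in a wider type so the addition cannot wrap.
      uint8_t Avg = static_cast<uint8_t>((A + B + 1) >> 1);
      assert(Folded == Avg && "identity must hold for every input pair");
    }
  }
  return 0;
}

The identity follows from A + B == 2*(A & B) + (A ^ B) and A | B == (A & B) + (A ^ B): both sides reduce to (A & B) + ceil((A ^ B) / 2), and (A | B) - ((A ^ B) >> 1) never overflows or underflows the operand width.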


More information about the llvm-commits mailing list