[llvm] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits-1 (PR #152273)

Wed Aug 6 02:00:31 PDT 2025

https://github.com/houngkoungting created https://github.com/llvm/llvm-project/pull/152273

Alive2 proof (avgceil version): https://alive2.llvm.org/ce/z/2CKrRh  
Fixes #147773. After several iterations, I believe this version is correct and complete.

@RKSimon 

>From 80e303c6e0976d8c2437a806679a54d5919c5917 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Wed, 6 Aug 2025 16:17:48 +0800
Subject: [PATCH] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if
 they have sufficient leading zero/sign bits-1

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/trunc-avg-fold.ll   | 43 ++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/trunc-avg-fold.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d70e96938ed9a..9ff256f8090ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16294,6 +16294,51 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   // because targets may prefer a wider type during later combines and invert
   // this transform.
   switch (N0.getOpcode()) {
+  case ISD::AVGCEILU:
+  case ISD::AVGFLOORU:
+    if (!LegalOperations && N0.hasOneUse() &&
+        TLI.isOperationLegal(N0.getOpcode(), VT)) {
+      SDValue X = N0.getOperand(0);
+      SDValue Y = N0.getOperand(1);
+
+      KnownBits KnownX = DAG.computeKnownBits(X);
+      KnownBits KnownY = DAG.computeKnownBits(Y);
+
+      unsigned SrcBits = X.getScalarValueSizeInBits();
+      unsigned DstBits = VT.getScalarSizeInBits();
+      unsigned NeededLeadingZeros = SrcBits - DstBits + 1;
+
+      if (KnownX.countMinLeadingZeros() >= NeededLeadingZeros &&
+          KnownY.countMinLeadingZeros() >= NeededLeadingZeros) {
+        SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
+        SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
+        return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
+      }
+    }
+    break;
+
+  case ISD::AVGCEILS:
+  case ISD::AVGFLOORS:
+    if (!LegalOperations && N0.hasOneUse() &&
+        TLI.isOperationLegal(N0.getOpcode(), VT)) {
+      SDValue X = N0.getOperand(0);
+      SDValue Y = N0.getOperand(1);
+
+      unsigned SignBitsX = DAG.ComputeNumSignBits(X);
+      unsigned SignBitsY = DAG.ComputeNumSignBits(Y);
+
+      unsigned SrcBits = X.getScalarValueSizeInBits();
+      unsigned DstBits = VT.getScalarSizeInBits();
+      unsigned NeededSignBits = SrcBits - DstBits + 1;
+
+      if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) {
+        SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
+        SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
+        return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
+      }
+    }
+    break;
+
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
new file mode 100644
index 0000000000000..175f54d6f9c05
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -0,0 +1,43 @@
+; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s
+
+; CHECK-LABEL: test_avgceil_u
+; CHECK: urhadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) { ; unsigned ceiling average of the two truncated inputs
+  %ta = trunc <8 x i16> %a to <8 x i8> ; NOTE(review): inputs are narrowed before the average, so this presumably never forms trunc(avg(x,y)) - confirm the new fold is exercised
+  %tb = trunc <8 x i16> %b to <8 x i8>
+  %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) ; urhadd = unsigned rounding halving add, the "ceil" form this test is named after
+  ret <8 x i8> %res
+}
+
+; CHECK-LABEL: test_avgceil_s
+; CHECK: srhadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) { ; signed ceiling average of the two truncated inputs
+  %ta = trunc <8 x i16> %a to <8 x i8> ; NOTE(review): inputs are narrowed before the average, so this presumably never forms trunc(avg(x,y)) - confirm the new fold is exercised
+  %tb = trunc <8 x i16> %b to <8 x i8>
+  %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) ; srhadd = signed rounding halving add, the "ceil" form this test is named after
+  ret <8 x i8> %res
+}
+
+; CHECK-LABEL: test_avgfloor_u
+; CHECK: uhadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) { ; unsigned floor average of the two truncated inputs
+  %ta = trunc <8 x i16> %a to <8 x i8> ; NOTE(review): inputs are narrowed before the average, so this presumably never forms trunc(avg(x,y)) - confirm the new fold is exercised
+  %tb = trunc <8 x i16> %b to <8 x i8>
+  %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) ; uhadd = unsigned halving add (truncating), the "floor" form this test is named after
+  ret <8 x i8> %res
+}
+
+; CHECK-LABEL: test_avgfloor_s
+; CHECK: shadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) { ; signed floor average of the two truncated inputs
+  %ta = trunc <8 x i16> %a to <8 x i8> ; NOTE(review): inputs are narrowed before the average, so this presumably never forms trunc(avg(x,y)) - confirm the new fold is exercised
+  %tb = trunc <8 x i16> %b to <8 x i8>
+  %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta, <8 x i8> %tb) ; shadd = signed halving add (truncating), the "floor" form this test is named after
+  ret <8 x i8> %res
+}
+
+declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
+